diff --git a/lisa/microsoft/runbook/openvmm/openvmm-azure-smoke.yml b/lisa/microsoft/runbook/openvmm/openvmm-azure-smoke.yml new file mode 100644 index 0000000000..5c80c8207f --- /dev/null +++ b/lisa/microsoft/runbook/openvmm/openvmm-azure-smoke.yml @@ -0,0 +1,117 @@ +name: openvmm azure smoke +extension: + - ../../testsuites +variable: + - name: host_admin_username + value: lisatest + - name: host_admin_password + value: "" + is_secret: true + - name: host_admin_private_key_file + value: "" + is_secret: true + - name: guest_admin_username + value: lisatest + - name: guest_admin_password + value: "" + is_secret: true + - name: guest_admin_private_key_file + value: "" + is_secret: true + - name: guest_extra_user_data + value: "" + - name: subscription_id + value: "" + - name: location + value: "westus3" + - name: marketplace_image + value: "" + - name: vm_size + value: "" + - name: openvmm_binary + value: /usr/local/bin/openvmm + - name: openvmm_install_path + value: /usr/local/bin/openvmm + - name: openvmm_installer_repo + value: https://github.com/microsoft/openvmm.git + - name: openvmm_installer_ref + value: "" + - name: openvmm_installer_force_install + value: false + - name: openvmm_host_working_dir + value: /var/tmp + - name: uefi_firmware_path + value: "" + - name: uefi_firmware_is_remote_path + value: false + - name: disk_img_path + value: "" + - name: disk_img_is_remote_path + value: false + - name: tap_name + value: tap0 + - name: bridge_name + value: ovmbr0 + - name: tap_host_cidr + value: 10.0.0.1/24 + - name: forwarded_port + value: 60022 +notifier: + - type: html +transformer: + - type: openvmm_installer + phase: environment_connected + installer: + type: source + repo: $(openvmm_installer_repo) + ref: $(openvmm_installer_ref) + force_install: $(openvmm_installer_force_install) + install_path: $(openvmm_install_path) +platform: + - type: azure + admin_username: $(host_admin_username) + admin_password: $(host_admin_password) + admin_private_key_file: 
$(host_admin_private_key_file) + guest_enabled: true + guests: + - type: openvmm + use_parent_capability: false + username: $(guest_admin_username) + password: $(guest_admin_password) + private_key_file: $(guest_admin_private_key_file) + cloud_init: + extra_user_data: $(guest_extra_user_data) + lisa_working_dir: $(openvmm_host_working_dir) + openvmm_binary: $(openvmm_binary) + boot_mode: uefi + capability: + core_count: 2 + memory_mb: 2048 + uefi: + firmware_path: $(uefi_firmware_path) + firmware_is_remote_path: $(uefi_firmware_is_remote_path) + disk_img: $(disk_img_path) + disk_img_is_remote_path: $(disk_img_is_remote_path) + serial: + mode: file + network: + mode: tap + address_mode: discover + tap_name: $(tap_name) + bridge_name: $(bridge_name) + tap_host_cidr: $(tap_host_cidr) + forward_ssh_port: true + forwarded_port: $(forwarded_port) + azure: + subscription_id: $(subscription_id) + requirement: + azure: + marketplace: $(marketplace_image) + location: $(location) + vm_size: $(vm_size) +testcase: + - criteria: + name: + - verify_openvmm_guest_boot + - verify_openvmm_restart_via_platform + - verify_openvmm_stop_start_in_platform diff --git a/lisa/microsoft/testsuites/openvmm/openvmm.py b/lisa/microsoft/testsuites/openvmm/openvmm.py index 90e0afb848..90bc714bf8 100644 --- a/lisa/microsoft/testsuites/openvmm/openvmm.py +++ b/lisa/microsoft/testsuites/openvmm/openvmm.py @@ -1,60 +1,103 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-from assertpy import assert_that +from typing import Any -from lisa import Node, SkippedException, TestCaseMetadata, TestSuite, TestSuiteMetadata +from lisa import ( + Logger, + RemoteNode, + SkippedException, + TestCaseMetadata, + TestSuite, + TestSuiteMetadata, + simple_requirement, +) +from lisa.environment import EnvironmentStatus from lisa.features import StartStop from lisa.sut_orchestrator.openvmm.node import OpenVmmGuestNode -from lisa.testsuite import simple_requirement +from lisa.tools import Uname @TestSuiteMetadata( area="openvmm", category="functional", description=""" - Smoke coverage for OpenVMM guest provisioning and platform lifecycle. + This test suite validates OpenVMM guests running on a prepared L1 host. """, ) -class OpenVmmSmokeTestSuite(TestSuite): +class OpenVmmPlatformSuite(TestSuite): + def before_case(self, log: Logger, **kwargs: Any) -> None: + node = kwargs["node"] + if not isinstance(node, OpenVmmGuestNode): + raise SkippedException( + "This suite only applies to OpenVMM guest nodes. " + f"Actual node type: {type(node).__name__}." + ) + @TestCaseMetadata( description=""" - Validate an OpenVMM guest is provisioned, reachable over SSH, and can - execute a simple command after launch. + This case validates that an OpenVMM guest is reachable over SSH and that + the guest booted successfully. 
""", - priority=1, - requirement=simple_requirement(supported_features=[StartStop]), + priority=0, + requirement=simple_requirement( + environment_status=EnvironmentStatus.Deployed, + ), ) - def verify_openvmm_provisioning(self, node: Node) -> None: - openvmm_node = self._get_openvmm_guest(node) + def verify_openvmm_guest_boot( + self, + log: Logger, + node: RemoteNode, + ) -> None: + kernel_release = node.tools[Uname].get_linux_information().kernel_version_raw + log.info(f"Connected to OpenVMM guest kernel {kernel_release}") - result = openvmm_node.execute("echo openvmm-smoke", shell=True) + @TestCaseMetadata( + description=""" + This case validates that platform restart keeps the OpenVMM guest + reachable after the restart. + """, + priority=0, + requirement=simple_requirement( + environment_status=EnvironmentStatus.Deployed, + supported_features=[StartStop], + ), + ) + def verify_openvmm_restart_via_platform( + self, + log: Logger, + node: RemoteNode, + ) -> None: + start_stop = node.features[StartStop] + start_stop.restart() - result.assert_exit_code() - assert_that(result.stdout.strip()).is_equal_to("openvmm-smoke") + kernel_release = node.tools[Uname].get_linux_information().kernel_version_raw + log.info(f"OpenVMM guest returned after restart on kernel {kernel_release}") @TestCaseMetadata( description=""" - Validate the OpenVMM StartStop feature can stop and start a guest while - preserving SSH connectivity for subsequent command execution. + This case validates that platform stop/start keeps the OpenVMM guest + reachable for subsequent command execution. 
""", - priority=1, - requirement=simple_requirement(supported_features=[StartStop]), + priority=0, + requirement=simple_requirement( + environment_status=EnvironmentStatus.Deployed, + supported_features=[StartStop], + ), ) - def verify_openvmm_stop_start_in_platform(self, node: Node) -> None: - openvmm_node = self._get_openvmm_guest(node) - - start_stop = openvmm_node.features[StartStop] + def verify_openvmm_stop_start_in_platform( + self, + log: Logger, + node: RemoteNode, + ) -> None: + start_stop = node.features[StartStop] + log.info("Stopping OpenVMM guest via platform") start_stop.stop(wait=True) + log.info("Starting OpenVMM guest via platform") start_stop.start(wait=True) - result = openvmm_node.execute("echo openvmm-recovered", shell=True) - - result.assert_exit_code() - assert_that(result.stdout.strip()).is_equal_to("openvmm-recovered") - - def _get_openvmm_guest(self, node: Node) -> OpenVmmGuestNode: - if not isinstance(node, OpenVmmGuestNode): - raise SkippedException("This suite only applies to OpenVMM guest nodes.") - - return node + kernel_release = node.tools[Uname].get_linux_information().kernel_version_raw + log.info( + f"OpenVMM guest returned after platform stop/start on kernel " + f"{kernel_release}" + ) diff --git a/lisa/node.py b/lisa/node.py index 39e070487c..d0a5d6f71c 100644 --- a/lisa/node.py +++ b/lisa/node.py @@ -355,12 +355,30 @@ def execute_async( ) def cleanup(self) -> None: + for guest in self.guests: + try: + guest.cleanup() + except Exception: + self.log.exception( + "failed to clean up guest " + f"'{guest.name or guest.index}' while cleaning node " + f"'{self.name}'. Continuing parent cleanup." 
+ ) self.log.debug("cleaning up...") if hasattr(self, "_log_handler") and self._log_handler: remove_handler(self._log_handler, self.log) self._log_handler.close() def close(self) -> None: + for guest in self.guests: + try: + guest.close() + except Exception: + self.log.exception( + "failed to close guest " + f"'{guest.name or guest.index}' while closing node " + f"'{self.name}'. Continuing parent close." + ) self.log.debug("closing node connection...") if self._shell: self._shell.close() @@ -553,7 +571,12 @@ def mark_dirty(self) -> None: self._is_dirty = True def test_connection(self) -> bool: - assert self._shell + if not self._shell: + self.log.debug( + f"connection test failed for node '{self.name}' because its " + "shell is not initialized" + ) + return False if not self._shell.is_remote: return True self.log.debug("testing connection...") diff --git a/lisa/runners/lisa_runner.py b/lisa/runners/lisa_runner.py index a7b2117f27..bcb0d6b12e 100644 --- a/lisa/runners/lisa_runner.py +++ b/lisa/runners/lisa_runner.py @@ -200,6 +200,13 @@ def _dispatch_test_result( # run on deployed environment can_run_results = [x for x in can_run_results if x.can_run] if environment.status == EnvironmentStatus.Deployed and can_run_results: + if self._guest_enabled: + return self._generate_task( + task_method=self._initialize_environment_task, + environment=environment, + test_results=can_run_results[:1], + ) + selected_test_results = self._get_test_result_to_run( test_results=test_results, environment=environment ) @@ -338,6 +345,9 @@ def _initialize_environment_task( phase=constants.TRANSFORMER_PHASE_ENVIRONMENT_CONNECTED, environment=environment, ) + if self._guest_enabled: + guest_environment = environment.get_guest_environment() + guest_environment.nodes.initialize() except Exception as e: self._attach_failed_environment_to_result( environment=environment, @@ -636,8 +646,10 @@ def _get_runnable_test_results( ) and ( environment_status is None - or 
x.runtime_data.metadata.requirement.environment_status - == environment_status + or self._matches_environment_status( + x.runtime_data.metadata.requirement.environment_status, + environment_status, + ) ) ] if environment: @@ -685,6 +697,23 @@ def _get_runnable_test_results( results = self._sort_test_results(results) return results + def _matches_environment_status( + self, + requirement_status: EnvironmentStatus, + actual_status: EnvironmentStatus, + ) -> bool: + if requirement_status == actual_status: + return True + + if ( + self._guest_enabled + and actual_status == EnvironmentStatus.Connected + and requirement_status == EnvironmentStatus.Deployed + ): + return True + + return False + def _get_test_result_to_run( self, test_results: List[TestResult], environment: Environment ) -> List[TestResult]: diff --git a/lisa/sut_orchestrator/azure/platform_.py b/lisa/sut_orchestrator/azure/platform_.py index a0c51b38c7..deecce1c33 100644 --- a/lisa/sut_orchestrator/azure/platform_.py +++ b/lisa/sut_orchestrator/azure/platform_.py @@ -802,7 +802,9 @@ def _get_node_information(self, node: Node) -> Dict[str, str]: # noqa: C901 # Guest nodes (like WslContainerNode) don't have features attribute # Skip security profile collection for guest nodes - if hasattr(node, "features"): + if hasattr(node, "features") and node.features.is_supported( + SecurityProfile + ): security_profile = node.features[SecurityProfile].get_settings() else: security_profile = None @@ -968,7 +970,11 @@ def _get_kernel_version(self, node: Node) -> str: linux_information = node.tools[Uname].get_linux_information() result = linux_information.kernel_version_raw elif not node.is_connected or node.is_posix: - if not result and hasattr(node, ATTRIBUTE_FEATURES): + if ( + not result + and hasattr(node, ATTRIBUTE_FEATURES) + and node.features.is_supported(features.SerialConsole) + ): # try to get kernel version in Azure. 
use it, when uname doesn't work node.log.debug("detecting kernel version from serial log...") serial_console = node.features[features.SerialConsole] @@ -1004,7 +1010,11 @@ def _get_wala_version(self, node: Node) -> str: node.log.debug(f"error on run waagent: {e}") if not node.is_connected or node.is_posix: - if not result and hasattr(node, ATTRIBUTE_FEATURES): + if ( + not result + and hasattr(node, ATTRIBUTE_FEATURES) + and node.features.is_supported(features.SerialConsole) + ): node.log.debug("detecting wala agent version from serial log...") serial_console = node.features[features.SerialConsole] result = serial_console.get_matched_str(WALA_VERSION_PATTERN) diff --git a/lisa/sut_orchestrator/openvmm/context.py b/lisa/sut_orchestrator/openvmm/context.py index 3cf4c6881a..cdeba7a493 100644 --- a/lisa/sut_orchestrator/openvmm/context.py +++ b/lisa/sut_orchestrator/openvmm/context.py @@ -5,6 +5,7 @@ from typing import Any, Dict, List, Optional from lisa.node import Node +from lisa.util import LisaException @dataclass @@ -26,6 +27,7 @@ class NodeContext: forwarding_interface: str = "" tap_created: bool = False tap_bridge_created: bool = False + tap_bridge_netfilter_disabled: bool = False tap_dhcp_input_rule_added: bool = False tap_dnsmasq_pid_file: str = "" tap_dnsmasq_lease_file: str = "" @@ -33,5 +35,29 @@ class NodeContext: command_line: str = "" +@dataclass +class OpenVmmHostContext: + original_ip_forward_value: str = "" + active_forwarding_count: int = 0 + original_bridge_netfilter_values: Dict[str, str] = field(default_factory=dict) + active_bridge_netfilter_count: int = 0 + + def get_node_context(node: Node) -> NodeContext: return node.get_context(NodeContext) + + +def get_host_context(node: Node) -> OpenVmmHostContext: + context_attr = "_openvmm_host_context" + if not hasattr(node, context_attr): + setattr(node, context_attr, OpenVmmHostContext()) + + context = getattr(node, context_attr) + if not isinstance(context, OpenVmmHostContext): + raise LisaException( 
+ "unexpected OpenVMM host context type " + f"'{type(context).__name__}' stored in '{context_attr}'. Clear " + "the stale attribute or ensure only OpenVMM stores " + "OpenVmmHostContext in this slot." + ) + return context diff --git a/lisa/sut_orchestrator/openvmm/node.py b/lisa/sut_orchestrator/openvmm/node.py index 864c74dc81..936a540332 100644 --- a/lisa/sut_orchestrator/openvmm/node.py +++ b/lisa/sut_orchestrator/openvmm/node.py @@ -2,30 +2,60 @@ # Licensed under the MIT license. import hashlib +import io +import ipaddress +import os import shlex -from pathlib import Path, PurePath -from typing import Any, List, Optional, Type, cast +import tempfile +import uuid +from abc import ABC, abstractmethod +from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath +from typing import Any, Dict, List, Optional, Type, cast -from lisa import schema, search_space +import yaml + +from lisa import constants, schema, search_space from lisa.feature import Features from lisa.node import Node, RemoteNode -from lisa.tools import Kill, Mkdir, OpenVmm +from lisa.tools import Dnsmasq, Ip, Kill, Mkdir, Modprobe, OpenVmm, Rm from lisa.tools.openvmm import OpenVmmLaunchConfig from lisa.util import ( LisaException, LisaTimeoutException, check_till_timeout, create_timer, + get_public_key_data, ) from lisa.util.logger import Logger from lisa.util.shell import wait_tcp_port_ready from .. import OPENVMM -from .context import get_node_context -from .schema import OPENVMM_NETWORK_MODE_USER, OpenVmmGuestNodeSchema +from .context import NodeContext, get_host_context, get_node_context +from .schema import ( + OPENVMM_ADDRESS_MODE_STATIC, + OPENVMM_NETWORK_MODE_TAP, + OPENVMM_NETWORK_MODE_USER, + OpenVmmGuestNodeSchema, + OpenVmmNetworkSchema, +) from .start_stop import StartStop +# Allow slower guest boot and reconnect paths on loaded L1 hosts. OPENVMM_CONNECTION_TIMEOUT = 300 +# Allow DHCP lease discovery enough time after OpenVMM launch. 
+OPENVMM_IP_DISCOVERY_TIMEOUT = 300 +# Capture enough recent log lines to include the relevant launch or boot failure. +OPENVMM_LOG_TAIL_LINES = 40 +OPENVMM_DHCP_SERVER_PORT = 67 +OPENVMM_BRIDGE_NETFILTER_KEYS = [ + "net.bridge.bridge-nf-call-iptables", + "net.bridge.bridge-nf-call-arptables", + "net.bridge.bridge-nf-call-ip6tables", +] + + +def _get_tap_host_interface_name(network: OpenVmmNetworkSchema) -> str: + return network.bridge_name or network.tap_name def _countspace_to_int(value: search_space.CountSpace) -> int: @@ -39,6 +69,33 @@ def _countspace_to_int(value: search_space.CountSpace) -> int: return chosen +class GuestIpResolver(ABC): + @abstractmethod + def resolve( + self, + host: Node, + node_context: Any, + network: OpenVmmNetworkSchema, + log: Logger, + ) -> str: + pass + + +class StaticAddressResolver(GuestIpResolver): + def resolve( + self, + host: Node, + node_context: Any, + network: OpenVmmNetworkSchema, + log: Logger, + ) -> str: + if not network.guest_address: + raise LisaException( + "guest_address is required when address_mode is 'static'" + ) + return network.guest_address + + class OpenVmmController: def __init__(self, node: "OpenVmmGuestNode") -> None: self._node = node @@ -92,12 +149,19 @@ def get_openvmm_tool(self, binary_path: str) -> OpenVmm: def launch(self, node: "OpenVmmGuestNode", log: Logger) -> None: runbook = cast(OpenVmmGuestNodeSchema, node.runbook) node_context = get_node_context(node) + self._prepare_tap_network(runbook.network, node_context) launch_config = OpenVmmLaunchConfig( uefi_firmware_path=node_context.uefi_firmware_path, disk_img_path=node_context.disk_img_path, + dvd_disk_paths=( + [node_context.cloud_init_file_path] + if node_context.cloud_init_file_path + else [] + ), processors=_countspace_to_int(node.capability.core_count), memory_mb=_countspace_to_int(node.capability.memory_mb), network_mode=runbook.network.mode, + tap_name=getattr(runbook.network, "tap_name", ""), network_cidr=runbook.network.consomme_cidr, 
serial_mode=runbook.serial.mode, serial_path=node_context.console_log_file_path, @@ -111,54 +175,731 @@ def launch(self, node: "OpenVmmGuestNode", log: Logger) -> None: node_context.process_id = openvmm.launch_vm( launch_config, cwd=launch_cwd, - sudo=False, + sudo=runbook.network.mode == OPENVMM_NETWORK_MODE_TAP, ) - self._ensure_process_running(node_context) + self._ensure_process_running(node_context, runbook.network) log.debug( f"Launched OpenVMM VM '{node_context.vm_name}' with pid " f"{node_context.process_id}" ) + def create_node_cloud_init_iso(self, node: "OpenVmmGuestNode") -> None: + runbook = cast(OpenVmmGuestNodeSchema, node.runbook) + node_context = get_node_context(node) + + user: dict[str, Any] = { + "name": runbook.username, + "shell": "/bin/bash", + "sudo": ["ALL=(ALL) NOPASSWD:ALL"], + "groups": ["sudo"], + } + if runbook.private_key_file: + user["ssh_authorized_keys"] = [ + get_public_key_data(runbook.private_key_file) + ] + + user_data: dict[str, Any] = { + "users": ["default", user], + } + if runbook.username == "root": + user_data["disable_root"] = False + if runbook.password: + user["lock_passwd"] = False + user["plain_text_passwd"] = runbook.password + user_data["ssh_pwauth"] = True + + for extra_user_data in node_context.extra_cloud_init_user_data: + for key, value in extra_user_data.items(): + existing_value = user_data.get(key) + if not existing_value: + user_data[key] = value + elif isinstance(existing_value, dict) and isinstance(value, dict): + existing_value.update(value) + elif isinstance(existing_value, list) and isinstance(value, list): + existing_value.extend(value) + else: + user_data[key] = value + + meta_data = { + "instance-id": f"{node_context.vm_name}-{uuid.uuid4().hex}", + "local-hostname": node_context.vm_name, + } + + user_data_string = "#cloud-config\n" + yaml.safe_dump(user_data) + meta_data_string = yaml.safe_dump(meta_data) + + tmp_dir = tempfile.TemporaryDirectory() + try: + iso_path = os.path.join(tmp_dir.name, 
"cloud-init.iso") + self._create_iso( + iso_path, + [ + ("/user-data", user_data_string), + ("/meta-data", meta_data_string), + ], + ) + self.host_node.shell.copy( + Path(iso_path), + self.host_node.get_pure_path(node_context.cloud_init_file_path), + ) + finally: + tmp_dir.cleanup() + + def _create_iso(self, file_path: str, files: List[tuple[str, str]]) -> None: + import pycdlib + + iso = pycdlib.PyCdlib() + iso_created = False + try: + iso.new(joliet=3, vol_ident="cidata") + iso_created = True + + for index, (path, contents) in enumerate(files): + contents_data = contents.encode() + iso.add_fp( + io.BytesIO(contents_data), + len(contents_data), + f"/{index}.;1", + joliet_path=path, + ) + + iso.write(file_path) + finally: + if iso_created: + iso.close() + + def _prepare_tap_network( + self, + network: OpenVmmNetworkSchema, + node_context: NodeContext, + ) -> None: + if network.mode != OPENVMM_NETWORK_MODE_TAP: + return + + tap_name = network.tap_name + bridge_name = network.bridge_name + host = self.host_node + ip_tool = host.tools[Ip] + host_interface_name = _get_tap_host_interface_name(network) + tap_gateway, dhcp_range = self._get_tap_network_config(network) + + if bridge_name: + self._disable_bridge_netfilter(node_context) + + if not ip_tool.nic_exists(bridge_name): + ip_tool.create_virtual_interface(bridge_name, "bridge") + node_context.tap_bridge_created = True + host.execute( + f"ip link set dev {shlex.quote(bridge_name)} type bridge stp_state 0", + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + f"failed to disable STP on OpenVMM bridge {bridge_name}" + ), + ) + host.execute( + ( + "ip link set dev " + f"{shlex.quote(bridge_name)} type bridge forward_delay 0" + ), + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + f"failed to set bridge forward delay on {bridge_name}" + ), + ) + host.execute( + ( + f"ip addr replace {shlex.quote(network.tap_host_cidr)} " + f"dev 
{shlex.quote(bridge_name)}" + ), + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + f"failed to configure OpenVMM bridge interface {bridge_name}" + ), + ) + ip_tool.up(bridge_name) + + if not ip_tool.nic_exists(tap_name): + whoami_result = host.execute( + "whoami", + shell=True, + no_info_log=True, + no_error_log=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + "failed to determine the host username with 'whoami' before " + f"creating OpenVMM tap interface {tap_name}. Verify that " + "'whoami' is available and working on the host." + ), + ) + username = whoami_result.stdout.strip() + if not username: + raise LisaException( + "failed to determine the host username before creating " + f"OpenVMM tap interface {tap_name}: 'whoami' returned an " + "empty username. Verify that the host shell environment is " + "configured correctly and that 'whoami' returns a valid user." + ) + host.execute( + ( + f"ip tuntap add {shlex.quote(tap_name)} mode tap " + f"user {shlex.quote(username)}" + ), + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + f"failed to create OpenVMM tap interface {tap_name}" + ), + ) + node_context.tap_created = True + + if bridge_name: + ip_tool.set_master(tap_name, bridge_name) + + if not bridge_name: + host.execute( + ( + f"ip addr replace {shlex.quote(network.tap_host_cidr)} " + f"dev {shlex.quote(tap_name)}" + ), + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + f"failed to configure OpenVMM tap interface {tap_name}" + ), + ) + ip_tool.up(tap_name) + + if network.address_mode != OPENVMM_ADDRESS_MODE_STATIC: + self._ensure_tap_dhcp_input_allowed(host_interface_name, node_context) + pid_file = f"/var/run/qemu-dnsmasq-{host_interface_name}.pid" + lease_file = f"/var/run/qemu-dnsmasq-{host_interface_name}.leases" + host.execute( + ( + f"test -f {shlex.quote(pid_file)} && " + f"kill $(cat {shlex.quote(pid_file)}) 
|| true; " + f"rm -f {shlex.quote(pid_file)}; " + f"cp /dev/null {shlex.quote(lease_file)}" + ), + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + "failed to reset OpenVMM dnsmasq state before starting " + f"DHCP on interface {host_interface_name}" + ), + ) + host.tools[Dnsmasq].start( + host_interface_name, + tap_gateway, + dhcp_range, + stop_firewall=False, + kill_existing=False, + pid_file=pid_file, + lease_file=lease_file, + ) + node_context.tap_dnsmasq_pid_file = pid_file + node_context.tap_dnsmasq_lease_file = lease_file + + self._log_tap_network_state(network, node_context) + if node_context.tap_dnsmasq_pid_file: + self._log_dnsmasq_state(node_context) + + def _disable_bridge_netfilter(self, node_context: NodeContext) -> None: + host = self.host_node + host_context = get_host_context(host) + modprobe = host.tools[Modprobe] + if modprobe.module_exists("br_netfilter") and not modprobe.is_module_loaded( + "br_netfilter", force_run=True + ): + modprobe.load("br_netfilter") + + if host_context.active_bridge_netfilter_count > 0: + host_context.active_bridge_netfilter_count += 1 + node_context.tap_bridge_netfilter_disabled = True + return + + original_values = {} + for key in OPENVMM_BRIDGE_NETFILTER_KEYS: + value_result = host.execute( + f"sysctl -n {shlex.quote(key)}", + shell=True, + sudo=True, + no_info_log=True, + no_error_log=True, + expected_exit_code=None, + ) + if value_result.exit_code == 0: + original_values[key] = value_result.stdout.strip() + + if not original_values: + return + + host_context.original_bridge_netfilter_values = original_values + host_context.active_bridge_netfilter_count = 1 + node_context.tap_bridge_netfilter_disabled = True + try: + self._set_bridge_netfilter_values( + {key: "0" for key in original_values}, + failure_message=( + "failed to disable bridge netfilter on the OpenVMM host" + ), + ) + except Exception: + try: + self._set_bridge_netfilter_values( + original_values, + 
failure_message=( + "failed to roll back bridge netfilter after an OpenVMM " + "setup error" + ), + ) + except Exception as cleanup_identifier: + self._log.debug( + "failed to roll back bridge netfilter after setup error: " + f"{cleanup_identifier}" + ) + host_context.original_bridge_netfilter_values = {} + host_context.active_bridge_netfilter_count = 0 + node_context.tap_bridge_netfilter_disabled = False + raise + + def _set_bridge_netfilter_values( + self, + values: Dict[str, str], + failure_message: str, + ) -> None: + for key, value in values.items(): + self.host_node.execute( + f"sysctl -w {shlex.quote(f'{key}={value}')}", + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=failure_message, + ) + + def _ensure_tap_dhcp_input_allowed( + self, host_interface_name: str, node_context: NodeContext + ) -> None: + iptables_exists = self.host_node.execute( + "command -v iptables >/dev/null 2>&1", + shell=True, + sudo=True, + no_info_log=True, + no_error_log=True, + expected_exit_code=None, + ) + if iptables_exists.exit_code != 0: + return + + rule = ( + f"INPUT -i {shlex.quote(host_interface_name)} -p udp -m udp " + f"--dport {OPENVMM_DHCP_SERVER_PORT} -j ACCEPT" + ) + check_result = self.host_node.execute( + f"iptables -C {rule}", + shell=True, + sudo=True, + no_info_log=True, + no_error_log=True, + expected_exit_code=None, + ) + if check_result.exit_code == 0: + return + + self.host_node.execute( + f"iptables -I {rule}", + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + "failed to allow DHCP traffic to the OpenVMM host interface" + ), + ) + node_context.tap_dhcp_input_rule_added = True + + def _get_tap_network_config(self, network: OpenVmmNetworkSchema) -> tuple[str, str]: + host_interface = ipaddress.ip_interface(network.tap_host_cidr) + guest_ip = network.guest_address + if not guest_ip: + for address in host_interface.network.hosts(): + if address != host_interface.ip: + guest_ip = 
str(address) + break + + if not guest_ip: + raise LisaException( + "failed to derive a guest IP for OpenVMM tap networking from " + f"'{network.tap_host_cidr}'. Provide network.guest_address." + ) + + return str(host_interface.ip), f"{guest_ip},{guest_ip}" + def configure_connection(self, node: RemoteNode, log: Logger) -> None: runbook = cast(OpenVmmGuestNodeSchema, node.runbook) + network = runbook.network + node_context = get_node_context(node) + + guest_address = self._resolve_guest_address(node_context, network, log) + node_context.guest_address = guest_address + + address = guest_address + public_address = network.connection_address or guest_address + port = network.ssh_port + public_port = port + + if network.forward_ssh_port: + self._enable_ssh_forwarding(node_context, guest_address, network) + public_address = ( + network.connection_address or self._get_host_public_address() + ) + public_port = network.forwarded_port + node.set_connection_info( - address=runbook.network.connection_address, - public_address=runbook.network.connection_address, + address=address, + public_address=public_address, username=runbook.username, password=runbook.password, private_key_file=runbook.private_key_file, - port=runbook.network.ssh_port, - public_port=runbook.network.ssh_port, + port=port, + public_port=public_port, ) try: is_ready, error_code = wait_tcp_port_ready( - runbook.network.connection_address, - runbook.network.ssh_port, + public_address, + public_port, log=log, timeout=OPENVMM_CONNECTION_TIMEOUT, ) except LisaException as identifier: raise LisaException( "OpenVMM guest SSH port readiness check failed for " - f"{runbook.network.connection_address}:{runbook.network.ssh_port}. " + f"{public_address}:{public_port}. " "Verify the guest is running, port forwarding or network " "configuration is correct, the SSH service is listening on the " "expected port, and review the OpenVMM guest and host logs for " - "startup or networking errors." + "startup or networking errors. 
" + f"{self._get_openvmm_failure_context(node_context, runbook.network)}" ) from identifier if not is_ready: raise LisaException( "OpenVMM guest SSH port did not become reachable at " - f"{runbook.network.connection_address}:{runbook.network.ssh_port} " + f"{public_address}:{public_port} " f"(error code: {error_code}). Verify the guest is running, " "port forwarding or network configuration is correct, the SSH " "service is listening on the expected port, and review the " - "OpenVMM guest and host logs for startup or networking errors." + "OpenVMM guest and host logs for startup or networking errors. " + f"{self._get_openvmm_failure_context(node_context, runbook.network)}" ) + def _resolve_guest_address( + self, + node_context: Any, + network: OpenVmmNetworkSchema, + log: Logger, + ) -> str: + if network.mode == OPENVMM_NETWORK_MODE_USER: + return network.connection_address or self._get_host_public_address() + + if network.address_mode == OPENVMM_ADDRESS_MODE_STATIC: + return StaticAddressResolver().resolve( + self.host_node, node_context, network, log + ) + elif network.mode == OPENVMM_NETWORK_MODE_TAP: + return self._get_tap_guest_address(node_context, network, log) + else: + raise LisaException( + "address discovery is supported only for tap networking. " + "Use address_mode 'static' for other network modes." 
+ ) + + def _get_tap_guest_address( + self, + node_context: Any, + network: OpenVmmNetworkSchema, + log: Logger, + ) -> str: + _, dhcp_range = self._get_tap_network_config(network) + guest_address = dhcp_range.split(",", maxsplit=1)[0].strip() + if not guest_address: + raise LisaException( + "failed to derive the OpenVMM guest IP from " + f"'{network.tap_host_cidr}'" + ) + self._wait_for_tap_lease(node_context, guest_address, log, network) + return guest_address + + def _wait_for_tap_lease( + self, + node_context: Any, + guest_address: str, + log: Logger, + network: Optional[OpenVmmNetworkSchema] = None, + timeout: int = OPENVMM_IP_DISCOVERY_TIMEOUT, + ) -> None: + lease_file = node_context.tap_dnsmasq_lease_file + if not lease_file: + raise LisaException( + "OpenVMM TAP DHCP lease tracking is not configured. " + "dnsmasq lease file path was not recorded." + ) + + def _lease_is_ready() -> bool: + result = self.host_node.execute( + ( + f"test -f {shlex.quote(lease_file)} && " + f"cat {shlex.quote(lease_file)} || true" + ), + shell=True, + sudo=True, + no_info_log=True, + no_error_log=True, + expected_exit_code=0, + ) + for lease_line in result.stdout.splitlines(): + lease_fields = lease_line.split() + if len(lease_fields) >= 3 and lease_fields[2] == guest_address: + log.debug( + "confirmed OpenVMM guest DHCP lease " + f"'{guest_address}' in {lease_file}" + ) + return True + if not self._is_process_running(node_context.process_id): + raise LisaException( + "OpenVMM process exited before the guest acquired the expected " + f"DHCP lease '{guest_address}'. 
" + f"{self._get_openvmm_failure_context(node_context, network)}" + ) + return False + + try: + check_till_timeout( + _lease_is_ready, + timeout_message=( + "wait for OpenVMM guest DHCP lease " + f"'{guest_address}' in '{lease_file}'" + ), + timeout=timeout, + ) + except LisaTimeoutException as identifier: + raise LisaException( + "OpenVMM guest did not acquire the expected DHCP lease " + f"'{guest_address}' on '{lease_file}'. " + f"{self._get_openvmm_failure_context(node_context, network)}" + ) from identifier + + def _get_openvmm_failure_context( + self, + node_context: Any, + network: Optional[OpenVmmNetworkSchema], + ) -> str: + details: list[str] = [] + + self._log_tap_network_state(network, node_context, log_commands=True) + self._log_dnsmasq_state(node_context, log_commands=True) + self._log_process_state(node_context, log_commands=True) + self._log_forwarding_state(node_context, network, log_commands=True) + + if node_context.tap_dnsmasq_lease_file: + lease_result = self.host_node.execute( + ( + f"test -f {shlex.quote(node_context.tap_dnsmasq_lease_file)} && " + "tail -n " + f"{OPENVMM_LOG_TAIL_LINES} " + f"{shlex.quote(node_context.tap_dnsmasq_lease_file)} || true" + ), + shell=True, + sudo=True, + no_info_log=False, + expected_exit_code=0, + ) + lease_output = lease_result.stdout.strip() + details.append( + "lease tail: " + (lease_output if lease_output else "") + ) + + for label, path in [ + ("console tail", node_context.console_log_file_path), + ("launcher tail", node_context.launcher_log_file_path), + ]: + if not path: + continue + result = self.host_node.execute( + ( + f"test -f {shlex.quote(path)} && " + f"tail -n {OPENVMM_LOG_TAIL_LINES} {shlex.quote(path)} || true" + ), + shell=True, + sudo=True, + no_info_log=False, + expected_exit_code=0, + ) + output = result.stdout.strip() + details.append(f"{label}: " + (output if output else "")) + + return " | ".join(details) + + def _log_tap_network_state( + self, + network: Optional[OpenVmmNetworkSchema], 
def _log_process_state(self, node_context: Any, log_commands: bool = False) -> None:
    """Log the ``ps`` status line of the recorded OpenVMM process.

    Nothing is executed when ``node_context.process_id`` is empty. The
    command ends in ``|| true`` so a missing process does not fail it.

    Args:
        node_context: Guest node context; only ``process_id`` is read.
        log_commands: Forwarded unchanged to ``_log_command_outputs``.
    """
    recorded_pid = node_context.process_id
    if recorded_pid:
        status_command = " ".join(
            [
                "ps",
                "-p",
                shlex.quote(recorded_pid),
                "-o",
                "pid=,ppid=,stat=,etime=,cmd=",
                "2>/dev/null",
                "||",
                "true",
            ]
        )
        self._log_command_outputs(
            "process state",
            [("openvmm process status", status_command)],
            log_commands=log_commands,
        )
def _log_command_outputs(
    self,
    section: str,
    commands: list[tuple[str, str]],
    log_commands: bool = False,
) -> None:
    """Run diagnostic host commands and log what they reported.

    Each command runs on the host through a shell with sudo and without an
    expected exit code. The method is best-effort: a command that raises
    ``LisaException`` never aborts the remaining commands.

    Args:
        section: Prefix of the combined debug message.
        commands: ``(label, shell command)`` pairs to run in order.
        log_commands: When true, ``execute`` is called with its info and
            error logging left on and the stdout is not collected here.
            When false, that logging is turned off and the stripped stdout
            of every command is joined into one debug message.
    """
    outputs: list[str] = []
    for label, command in commands:
        try:
            result = self.host_node.execute(
                command,
                shell=True,
                sudo=True,
                no_info_log=not log_commands,
                no_error_log=not log_commands,
                expected_exit_code=None,
            )
        except LisaException as identifier:
            # Keep going, but record why this entry is missing so a failed
            # command is distinguishable from one that printed nothing.
            outputs.append(f"{label}: failed to run: {identifier}")
        else:
            if not log_commands:
                outputs.append(f"{label}: {result.stdout.strip()}")

    if outputs:
        self._log.debug(f"{section}: {' | '.join(outputs)}")
def start_node(self, node: "OpenVmmGuestNode", wait: bool = True) -> None:
    """Launch the guest and optionally wait for its connection.

    When the guest runbook has a ``cloud_init`` section, the cloud-init ISO
    is created again before the launch.

    Args:
        node: Guest node to start.
        wait: When true, ``configure_connection`` is called after launch.
    """
    guest_runbook = cast(OpenVmmGuestNodeSchema, node.runbook)
    wants_cloud_init = bool(guest_runbook.cloud_init)
    if wants_cloud_init:
        self.create_node_cloud_init_iso(node)

    self.launch(node, node.log)
    if not wait:
        return
    self.configure_connection(node, node.log)
def _get_host_public_address(self) -> str:
    """Return the address used to reach the OpenVMM host.

    A host that is not remote is addressed through the loopback address;
    a remote host reports its own ``public_address``.
    """
    host = self.host_node
    if not host.is_remote:
        return "127.0.0.1"

    remote_host = cast(RemoteNode, host)
    return remote_host.public_address
" + f"stderr: {ip_forward_result.stderr.strip() or ''}" + ) + host_context.original_ip_forward_value = original_ip_forward_value + + host_context.active_forwarding_count += 1 + node_context.forwarding_interface = forwarding_interface + node_context.forwarded_port = forwarded_port + node_context.forwarding_enabled = True + + commands = [ + "sysctl -w net.ipv4.ip_forward=1", + ( + "iptables -C FORWARD -i " + f"{shlex.quote(host_interface)} -o {shlex.quote(forwarding_interface)} " + "-j ACCEPT " + "|| " + "iptables -I FORWARD -i " + f"{shlex.quote(host_interface)} -o {shlex.quote(forwarding_interface)} " + "-j ACCEPT" + ), + ( + "iptables -C FORWARD -i " + f"{shlex.quote(forwarding_interface)} -o {shlex.quote(host_interface)} " + "-m state --state RELATED,ESTABLISHED -j ACCEPT " + "|| " + "iptables -I FORWARD -i " + f"{shlex.quote(forwarding_interface)} -o {shlex.quote(host_interface)} " + "-m state --state RELATED,ESTABLISHED -j ACCEPT" + ), + ( + "iptables -C FORWARD -i " + f"{shlex.quote(forwarding_interface)} -o {shlex.quote(host_interface)} " + f"-p tcp -d {guest_address} --dport {guest_port} -j ACCEPT " + "|| " + "iptables -I FORWARD -i " + f"{shlex.quote(forwarding_interface)} -o {shlex.quote(host_interface)} " + f"-p tcp -d {guest_address} --dport {guest_port} -j ACCEPT" + ), + ( + "iptables -t nat -C POSTROUTING -s " + f"{shlex.quote(str(host_network))} " + f"-o {shlex.quote(forwarding_interface)} " + "-j MASQUERADE || " + "iptables -t nat -I POSTROUTING -s " + f"{shlex.quote(str(host_network))} " + f"-o {shlex.quote(forwarding_interface)} " + "-j MASQUERADE" + ), + ( + "iptables -t nat -C PREROUTING -p tcp --dport " + f"{forwarded_port} -j DNAT --to-destination " + f"{guest_address}:{guest_port} " + "|| " + "iptables -t nat -I PREROUTING -p tcp --dport " + f"{forwarded_port} -j DNAT --to-destination " + f"{guest_address}:{guest_port}" + ), + ( + "iptables -t nat -C OUTPUT -p tcp --dport " + f"{forwarded_port} -j DNAT --to-destination " + 
def _disable_ssh_forwarding(self, node: Node) -> None:
    """Remove the SSH forwarding state recorded for ``node``.

    Looks up the node context and the network section of the guest runbook,
    then hands both to ``_disable_ssh_forwarding_context``.
    """
    context = get_node_context(node)
    guest_runbook = cast(OpenVmmGuestNodeSchema, node.runbook)
    network_settings = guest_runbook.network
    self._disable_ssh_forwarding_context(context, network_settings)
), + ( + "iptables -t nat -D PREROUTING -p tcp --dport " + f"{forwarded_port} -j DNAT --to-destination " + f"{guest_address}:{guest_port} || true" + ), + ( + "iptables -t nat -D OUTPUT -p tcp --dport " + f"{forwarded_port} -j DNAT --to-destination " + f"{guest_address}:{guest_port} || true" + ), + ( + "iptables -t nat -D POSTROUTING -s " + f"{shlex.quote(str(host_network))} " + f"-o {shlex.quote(forwarding_interface)} " + "-j MASQUERADE || true" + ), + ] + for command in commands: + self.host_node.execute( + command, + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + "failed to remove OpenVMM SSH forwarding" + ), + ) + + if node_context.forwarding_enabled and host_context.active_forwarding_count > 0: + host_context.active_forwarding_count -= 1 + + if ( + host_context.active_forwarding_count == 0 + and host_context.original_ip_forward_value + ): + self.host_node.execute( + "sysctl -w net.ipv4.ip_forward=" + f"{shlex.quote(host_context.original_ip_forward_value)}", + shell=True, + sudo=True, + expected_exit_code=0, + expected_exit_code_failure_message=( + "failed to restore host ip_forward state after OpenVMM " + "SSH forwarding" + ), + ) + host_context.original_ip_forward_value = "" + + node_context.forwarded_port = 0 + node_context.forwarding_enabled = False + node_context.forwarding_interface = "" + def _wait_for_process_exit(self, process_id: str, timeout: int = 60) -> None: try: check_till_timeout( @@ -208,18 +1220,22 @@ def _wait_for_process_exit(self, process_id: str, timeout: int = 60) -> None: "for details." 
) from identifier - def _ensure_process_running(self, node_context: Any, grace_period: int = 2) -> None: - timeout = max(grace_period + 1, 1) + def _ensure_process_running( + self, + node_context: Any, + network: OpenVmmNetworkSchema, + grace_period_seconds: int = 2, + ) -> None: + timeout = max(grace_period_seconds + 1, 1) grace_timer = create_timer() def _process_survived_grace_period() -> bool: if not self._is_process_running(node_context.process_id): raise LisaException( "OpenVMM process exited immediately after launch. " - f"Check {node_context.launcher_log_file_path} on the host " - "for details." + f"{self._get_openvmm_failure_context(node_context, network)}" ) - return grace_timer.elapsed(False) >= grace_period + return grace_timer.elapsed(False) >= grace_period_seconds check_till_timeout( _process_survived_grace_period, @@ -244,6 +1260,80 @@ def _is_process_running(self, process_id: str) -> bool: ) return result.exit_code == 0 + def _teardown_tap_network( + self, + node_context: Any, + network: OpenVmmNetworkSchema, + ) -> None: + if network.mode != OPENVMM_NETWORK_MODE_TAP: + return + + if node_context.tap_dhcp_input_rule_added: + host_interface_name = _get_tap_host_interface_name(network) + self.host_node.execute( + ( + "iptables -D INPUT -i " + f"{shlex.quote(host_interface_name)} -p udp -m udp " + f"--dport {OPENVMM_DHCP_SERVER_PORT} -j ACCEPT || true" + ), + shell=True, + sudo=True, + expected_exit_code=0, + ) + node_context.tap_dhcp_input_rule_added = False + + if node_context.tap_dnsmasq_pid_file: + self.host_node.execute( + ( + f"test -f {shlex.quote(node_context.tap_dnsmasq_pid_file)} && " + "kill $(cat " + f"{shlex.quote(node_context.tap_dnsmasq_pid_file)}) || true" + ), + shell=True, + sudo=True, + expected_exit_code=0, + ) + node_context.tap_dnsmasq_pid_file = "" + node_context.tap_dnsmasq_lease_file = "" + + if node_context.tap_created: + self.host_node.execute( + f"ip link delete {shlex.quote(network.tap_name)} || true", + shell=True, + 
sudo=True, + expected_exit_code=0, + ) + node_context.tap_created = False + + if node_context.tap_bridge_created and network.bridge_name: + self.host_node.execute( + f"ip link delete {shlex.quote(network.bridge_name)} || true", + shell=True, + sudo=True, + expected_exit_code=0, + ) + node_context.tap_bridge_created = False + + if node_context.tap_bridge_netfilter_disabled: + host_context = get_host_context(self.host_node) + if host_context.active_bridge_netfilter_count > 0: + host_context.active_bridge_netfilter_count -= 1 + + if ( + host_context.active_bridge_netfilter_count == 0 + and host_context.original_bridge_netfilter_values + ): + self._set_bridge_netfilter_values( + host_context.original_bridge_netfilter_values, + failure_message=( + "failed to restore bridge netfilter state on the " + "OpenVMM host" + ), + ) + host_context.original_bridge_netfilter_values = {} + + node_context.tap_bridge_netfilter_disabled = False + class OpenVmmGuestNode(RemoteNode): def __init__( @@ -284,6 +1374,12 @@ def cleanup(self) -> None: self._openvmm_controller.stop_node(self, wait=False) except Exception as identifier: self.log.debug(f"failed to stop OpenVMM guest during cleanup: {identifier}") + try: + self._openvmm_controller.cleanup_node_artifacts(self) + except Exception as identifier: + self.log.debug( + f"failed to clean OpenVMM guest artifacts during cleanup: {identifier}" + ) super().cleanup() def _initialize(self, *args: Any, **kwargs: Any) -> None: @@ -334,6 +1430,16 @@ def _provision(self) -> None: ) ) + if runbook.cloud_init: + node_context.cloud_init_file_path = str(working_path / "cloud-init.iso") + + self._load_extra_cloud_init_user_data( + runbook.cloud_init.extra_user_data, + node_context, + ) + + self._openvmm_controller.create_node_cloud_init_iso(self) + node_context.launcher_log_file_path = str(working_path / "openvmm-launcher.log") node_context.launcher_stderr_log_file_path = str( working_path / "openvmm-launcher.stderr.log" @@ -341,13 +1447,83 @@ def 
_provision(self) -> None: node_context.console_log_file_path = str(working_path / "openvmm-console.log") node_context.ssh_port = runbook.network.ssh_port - if runbook.network.mode != OPENVMM_NETWORK_MODE_USER: + self._openvmm_controller.launch(self, self.log) + self._openvmm_controller.configure_connection(self, self.log) + + def _resolve_extra_user_data_file(self, relative_file_path: str) -> Path: + root_path = constants.RUNBOOK_PATH.resolve().absolute() + posix_path = PurePosixPath(relative_file_path) + windows_path = PureWindowsPath(relative_file_path) + + if ( + posix_path.is_absolute() + or windows_path.is_absolute() + or windows_path.drive + or windows_path.root + ): raise LisaException( - "base OpenVMM orchestrator support requires user-mode networking" + "cloud-init extra_user_data file path must be relative to the " + f"runbook directory: '{relative_file_path}'" ) - self._openvmm_controller.launch(self, self.log) - self._openvmm_controller.configure_connection(self, self.log) + file_path = root_path.joinpath(relative_file_path).resolve() + try: + file_path.relative_to(root_path) + except ValueError as identifier: + raise LisaException( + "cloud-init extra_user_data file path " + f"'{relative_file_path}' escapes the runbook directory " + f"'{root_path}'. Use a relative path under the runbook directory." 
def _load_extra_cloud_init_user_data(
    self,
    extra_user_data: Optional[Any],
    node_context: Any,
) -> None:
    """Read extra cloud-init user-data files into the node context.

    Args:
        extra_user_data: A single path or an iterable of paths. Falsy values
            and falsy entries are ignored. Every path is resolved through
            ``_resolve_extra_user_data_file``.
        node_context: Guest node context; each parsed mapping is appended to
            its ``extra_cloud_init_user_data`` list in input order.

    Raises:
        LisaException: If a file cannot be read, is not valid YAML, or does
            not contain a YAML mapping at the top level.
    """
    if not extra_user_data:
        return

    # A lone string is treated as a one-element list of paths.
    candidates = (
        [extra_user_data] if isinstance(extra_user_data, str) else extra_user_data
    )

    for relative_file_path in filter(None, candidates):
        file_path = self._resolve_extra_user_data_file(relative_file_path)

        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                raw_text = handle.read()
        except OSError as identifier:
            raise LisaException(
                "failed to read cloud-init extra_user_data file "
                f"'{relative_file_path}' resolved to '{file_path}'. "
                "Verify the file exists under the runbook directory and is "
                "readable."
            ) from identifier

        try:
            parsed = yaml.safe_load(raw_text)
        except yaml.YAMLError as identifier:
            raise LisaException(
                "failed to parse cloud-init extra_user_data file "
                f"'{file_path}'. Verify the file contains valid YAML "
                "mapping content that cloud-init can merge. "
                f"Parse error: {identifier}"
            ) from identifier

        if isinstance(parsed, dict):
            node_context.extra_cloud_init_user_data.append(parsed)
            continue

        raise LisaException(
            "invalid cloud-init extra_user_data file "
            f"'{file_path}': expected a YAML mapping/dictionary, but got "
            f"{type(parsed).__name__}. Update the file to "
            "contain key/value pairs that cloud-init can merge."
        )
+import ipaddress from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, Union from dataclasses_json import config, dataclass_json @@ -13,11 +14,20 @@ from .. import OPENVMM OPENVMM_BOOT_MODE_UEFI = "uefi" +OPENVMM_ADDRESS_MODE_DISCOVER = "discover" +OPENVMM_ADDRESS_MODE_STATIC = "static" OPENVMM_NETWORK_MODE_USER = "user" +OPENVMM_NETWORK_MODE_TAP = "tap" OPENVMM_SERIAL_MODE_STDERR = "stderr" OPENVMM_SERIAL_MODE_FILE = "file" +@dataclass_json() +@dataclass +class CloudInitSchema: + extra_user_data: Optional[Union[str, List[str]]] = None + + @dataclass_json() @dataclass class OpenVmmInstallerSchema(schema.TypedSchema, schema.ExtendableSchemaMixin): @@ -68,6 +78,11 @@ def __post_init__(self) -> None: @dataclass class OpenVmmNetworkSchema: mode: str = OPENVMM_NETWORK_MODE_USER + address_mode: str = OPENVMM_ADDRESS_MODE_DISCOVER + tap_name: str = "" + bridge_name: str = "" + tap_host_cidr: str = "10.0.0.1/24" + guest_address: str = "" connection_address: str = "" consomme_cidr: str = "" ssh_port: int = field( @@ -77,16 +92,86 @@ class OpenVmmNetworkSchema: validate=schema.validate.Range(min=1, max=65535), ), ) + forward_ssh_port: bool = False + forwarded_port: int = field( + default=0, + metadata=schema.field_metadata( + field_function=schema.fields.Int, + validate=schema.validate.Range(min=0, max=65535), + ), + ) + + def _validate_tap_host_cidr(self) -> None: + if self.mode != OPENVMM_NETWORK_MODE_TAP: + return + + if not self.tap_host_cidr: + raise LisaException("tap_host_cidr is required when network mode is 'tap'") + + try: + ipaddress.ip_interface(self.tap_host_cidr) + except ValueError as identifier: + raise LisaException( + "tap_host_cidr " + f"'{self.tap_host_cidr}' is invalid for OpenVMM tap networking. " + "Use an interface CIDR like '10.0.0.1/24'." 
def __post_init__(self) -> None:
    """Validate the network settings right after construction.

    Checks run in this order: network mode, tap name, tap host CIDR
    (delegated to ``_validate_tap_host_cidr``), address mode, SSH port
    forwarding settings, and finally the static guest address for tap mode.

    Raises:
        LisaException: On the first setting that fails validation.
    """
    if self.mode not in [
        OPENVMM_NETWORK_MODE_USER,
        OPENVMM_NETWORK_MODE_TAP,
    ]:
        raise LisaException(
            f"network mode '{self.mode}' is not supported for OpenVMM guests. "
            f"Supported values: {OPENVMM_NETWORK_MODE_USER}, "
            f"{OPENVMM_NETWORK_MODE_TAP}"
        )
    if self.mode == OPENVMM_NETWORK_MODE_TAP and not self.tap_name:
        raise LisaException("tap_name is required when network mode is 'tap'")
    self._validate_tap_host_cidr()
    if self.address_mode not in [
        OPENVMM_ADDRESS_MODE_DISCOVER,
        OPENVMM_ADDRESS_MODE_STATIC,
    ]:
        raise LisaException(
            f"address_mode '{self.address_mode}' is not supported. "
            f"Supported values: {OPENVMM_ADDRESS_MODE_DISCOVER}, "
            f"{OPENVMM_ADDRESS_MODE_STATIC}"
        )

    if self.forward_ssh_port:
        if self.mode != OPENVMM_NETWORK_MODE_TAP:
            raise LisaException(
                "forward_ssh_port is supported only with tap networking"
            )
        if (
            self.address_mode == OPENVMM_ADDRESS_MODE_STATIC
            and not self.guest_address
        ):
            raise LisaException(
                "guest_address is required when forward_ssh_port is enabled"
            )
        if self.forwarded_port <= 0 or self.forwarded_port > 65535:
            raise LisaException(
                "forwarded_port must be between 1 and 65535 when "
                "forward_ssh_port is enabled"
            )

    # User-mode networking needs no guest address, so validation ends here.
    if self.mode == OPENVMM_NETWORK_MODE_USER:
        return

    # From here on the mode can only be tap: every other value was rejected
    # at the top and user mode has returned. A separate "discover requires
    # tap" check could therefore never fire and is intentionally absent.
    if self.address_mode == OPENVMM_ADDRESS_MODE_STATIC and not self.guest_address:
        raise LisaException(
            "guest_address is required when address_mode is 'static'"
        )
OpenVmmGuestNodeSchema(schema.GuestNode): default="", repr=False, metadata=config(exclude=lambda x: True) ) private_key_file: str = "" + cloud_init: Optional[CloudInitSchema] = None lisa_working_dir: str = "/var/tmp" boot_mode: str = OPENVMM_BOOT_MODE_UEFI uefi: Optional[OpenVmmUefiSchema] = None @@ -124,3 +210,13 @@ def __post_init__(self) -> None: ) if not self.disk_img: raise LisaException("disk_img is required for UEFI OpenVMM guests") + if ( + self.cloud_init + and not self.private_key_file + and not self.password + and not self.cloud_init.extra_user_data + ): + raise LisaException( + "OpenVMM cloud_init requires private_key_file, password, or " + "cloud_init.extra_user_data to provision guest access" + ) diff --git a/lisa/tools/dnsmasq.py b/lisa/tools/dnsmasq.py index ae34531673..a163409d57 100644 --- a/lisa/tools/dnsmasq.py +++ b/lisa/tools/dnsmasq.py @@ -1,6 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import shlex + from lisa.executable import Tool from lisa.operating_system import Posix from lisa.tools.firewall import Firewall @@ -26,23 +28,40 @@ def start( nic_name: str, gateway: str, dhcp_range: str, + stop_firewall: bool = True, + kill_existing: bool = True, + pid_file: str = "", + lease_file: str = "", ) -> None: - # stop firewall - self.node.tools[Firewall].stop() + if stop_firewall: + # stop firewall + self.node.tools[Firewall].stop() + + if kill_existing: + # kill dnsmasq if it is running + kill = self.node.tools[Kill] + kill.by_name("dnsmasq") - # kill dnsmasq if it is running - kill = self.node.tools[Kill] - kill.by_name("dnsmasq") + if not pid_file: + pid_file = f"/var/run/qemu-dnsmasq-{nic_name}.pid" + if not lease_file: + lease_file = f"/var/run/qemu-dnsmasq-{nic_name}.leases" # setup dnsmasq on interface `nic_name` and listen on `nic_address` # assign dhcp address in `dhcp_range` - cmd = ( - "--strict-order --except-interface=lo " - f"--interface={nic_name} --listen-address={gateway} --bind-interfaces " - 
f"--dhcp-range={dhcp_range} --conf-file= " - f"--pid-file=/var/run/qemu-dnsmasq-{nic_name}.pid " - f"--dhcp-leasefile=/var/run/qemu-dnsmasq-{nic_name}.leases " - "--dhcp-no-override " + cmd = shlex.join( + [ + "--strict-order", + "--except-interface=lo", + f"--interface={nic_name}", + f"--listen-address={gateway}", + "--bind-interfaces", + f"--dhcp-range={dhcp_range}", + "--conf-file=", + f"--pid-file={pid_file}", + f"--dhcp-leasefile={lease_file}", + "--dhcp-no-override", + ] ) # start dnsmasq diff --git a/lisa/tools/openvmm.py b/lisa/tools/openvmm.py index 99279755bd..06f0cc4d48 100644 --- a/lisa/tools/openvmm.py +++ b/lisa/tools/openvmm.py @@ -151,9 +151,16 @@ def launch_vm( no_info_log=True, cwd=cwd, ) - pid = result.stdout.strip() - if not pid: - raise LisaException("OpenVMM launch did not return a PID") + pid_lines = [ + line.strip() for line in result.stdout.splitlines() if line.strip() + ] + pid = pid_lines[-1] if pid_lines else "" + if not pid or not pid.isdigit(): + raise LisaException( + "OpenVMM launch did not return a valid PID. " + f"stdout: {result.stdout.strip() or ''}. " + f"stderr: {result.stderr.strip() or ''}" + ) return pid def _build_launch_shell_command( @@ -163,4 +170,34 @@ def _build_launch_shell_command( if PurePath(config.stdout_path) == PurePath(config.stderr_path): return f"nohup {command} > {stdout_path} 2>&1 < /dev/null & echo $!" stderr_path = shlex.quote(config.stderr_path) - return f"nohup {command} > {stdout_path} 2> {stderr_path} < /dev/null & echo $!" + pid_path = shlex.quote(f"{config.stdout_path}.pid") + inner_command = shlex.quote(f"echo $$ > {pid_path}; exec {command}") + wrapped_command = shlex.quote(f"sh -c {inner_command}") + pty_command = shlex.quote( + f"tail -f /dev/null | script -qefc {wrapped_command} /dev/null" + ) + + # OpenVMM's management loop expects a tty for its stdio thread. 
Feed an + # always-open empty stream into script so detached launches behave like + # an interactive session instead of exiting on immediate stdin EOF. The + # script wrapper records the exec'd OpenVMM PID so later liveness checks + # and forced cleanup target the VM process rather than the wrapper shell. + return ( + "if command -v script >/dev/null 2>&1; then " + f"rm -f {pid_path}; " + f"nohup sh -c {pty_command} > {stdout_path} " + f"2> {stderr_path} < /dev/null & wrapper_pid=$!; " + "attempt=0; " + "while [ $attempt -lt 100 ]; do " + f"if [ -s {pid_path} ]; then cat {pid_path}; exit 0; fi; " + "if ! kill -0 $wrapper_pid >/dev/null 2>&1; then break; fi; " + "attempt=$((attempt + 1)); " + "sleep 0.1; " + "done; " + "echo 'OpenVMM launch did not record a child PID from the " + "script wrapper.' >&2; " + "exit 1; " + "else " + f"nohup {command} > {stdout_path} 2> {stderr_path} < /dev/null & echo $!; " + "fi" + ) diff --git a/selftests/test_openvmm_node.py b/selftests/test_openvmm_node.py index a658fb7d6c..0f03e6d742 100644 --- a/selftests/test_openvmm_node.py +++ b/selftests/test_openvmm_node.py @@ -20,21 +20,24 @@ class OpenVmmNodeTestCase(TestCase): - def _create_controller(self) -> Tuple[OpenVmmController, MagicMock, MagicMock]: + def _create_controller( + self, + ) -> Tuple[OpenVmmController, MagicMock, MagicMock, MagicMock]: shell_copy = MagicMock() kill_by_pid = MagicMock() + guest_log = MagicMock() host_node = SimpleNamespace( is_remote=True, get_pure_path=PurePosixPath, shell=SimpleNamespace(copy=shell_copy), tools={Kill: SimpleNamespace(by_pid=kill_by_pid)}, ) - guest_node = SimpleNamespace(parent=host_node, log=MagicMock()) + guest_node = SimpleNamespace(parent=host_node, log=guest_log) controller = OpenVmmController(cast(Any, guest_node)) - return controller, shell_copy, kill_by_pid + return controller, shell_copy, kill_by_pid, guest_log def test_resolve_guest_artifact_path_uses_unique_names(self) -> None: - controller, shell_copy, _ = 
self._create_controller() + controller, shell_copy, _, _ = self._create_controller() with TemporaryDirectory() as temp_dir: root = Path(temp_dir) first_dir = root / "first" @@ -61,8 +64,11 @@ def test_resolve_guest_artifact_path_uses_unique_names(self) -> None: self.assertEqual(2, shell_copy.call_count) def test_stop_node_kills_process_after_wait_timeout(self) -> None: - controller, _, kill_by_pid = self._create_controller() - node = SimpleNamespace(is_connected=False) + controller, _, kill_by_pid, guest_log = self._create_controller() + node = SimpleNamespace( + is_connected=False, + runbook=SimpleNamespace(network=OpenVmmNetworkSchema()), + ) node_context = NodeContext(process_id="1234") with patch( @@ -73,17 +79,19 @@ def test_stop_node_kills_process_after_wait_timeout(self) -> None: "_wait_for_process_exit", side_effect=LisaException("timeout"), ): - with self.assertRaises(LisaException): - controller.stop_node(cast(Any, node), wait=True) + controller.stop_node(cast(Any, node), wait=True) kill_by_pid.assert_called_once_with( "1234", ignore_not_exist=True, ) self.assertEqual("", node_context.process_id) + guest_log.info.assert_called_once_with( + "timeout Forcing OpenVMM process '1234' to stop." + ) def test_launch_uses_host_pure_path_for_cwd(self) -> None: - controller, _, _ = self._create_controller() + controller, _, _, _ = self._create_controller() openvmm = MagicMock() openvmm.build_command.return_value = "openvmm --uefi" openvmm.launch_vm.return_value = "1234"