Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions internal/controller/device/vpci/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
//go:build windows

// Package vpci provides a controller for managing virtual PCI (vPCI) device
// assignments on a Utility VM (UVM). It handles assigning and removing
// PCI devices from the UVM via HCS modify calls.
//
// # Lifecycle
//
// [Controller] tracks active device assignments by VMBus GUID (device identifier
// within UVM) in an internal map. Each assignment is reference-counted to
// support shared access by multiple callers.
//
// - [Controller.Reserve] generates a unique VMBus GUID for a device and
// records the reservation. If the same device is already reserved, the
// existing GUID is returned.
// - [Controller.AddToVM] assigns a previously reserved device to the VM
// using the VMBus GUID returned by Reserve. If the device is already
// assigned, the reference count is incremented and the call succeeds
// without a second host-side assignment.
// - [Controller.RemoveFromVM] decrements the reference count for the device
// identified by VMBus GUID. When it reaches zero, the device is removed
// from the VM. It also handles cleanup for devices that were reserved
// but never assigned.
//
// # Invalid Devices
//
// If the host-side assignment succeeds but the guest-side notification fails,
// the device is marked invalid. It remains tracked so that the caller can call
// [Controller.RemoveFromVM] to perform host-side cleanup. Until that cleanup
// happens, further [Controller.AddToVM] calls for the device are rejected.
//
// # Virtual Functions
//
// Each Virtual Function is assigned as an independent guest device with its own
// VMBus GUID. Multiple Virtual Functions on the same physical device are treated
// as separate devices in the guest.
//
// # Guest Requests
//
// On LCOW, assigning a vPCI device requires a guest-side notification so the
// GCS can wait for the required device paths to become available.
// WCOW does not require a guest request as part of device assignment.
package vpci
61 changes: 61 additions & 0 deletions internal/controller/device/vpci/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//go:build windows

package vpci

import (
"context"

"github.com/Microsoft/go-winio/pkg/guid"

hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
)

// Device holds the configuration required to assign a vPCI device to the VM.
//
// Device is used as a map key by [Controller] to detect duplicate
// reservations, so it must stay a comparable type. Two values with the same
// DeviceInstanceID and VirtualFunctionIndex refer to the same reservation.
type Device struct {
// DeviceInstanceID is the host device instance path of the vPCI device.
// It is forwarded to HCS as the function's DeviceInstancePath.
DeviceInstanceID string

// VirtualFunctionIndex is the SR-IOV virtual function index to assign.
// It is forwarded to HCS as the function's VirtualFunction.
VirtualFunctionIndex uint16
}

// vmVPCI manages adding and removing vPCI devices for a Utility VM.
// Implemented by [vmmanager.UtilityVM].
//
// [Controller] calls these methods while holding its mutex and passes the
// VMBus GUID in its string form.
type vmVPCI interface {
// AddDevice adds a vPCI device identified by `vmBusGUID` to the Utility VM with the provided settings.
AddDevice(ctx context.Context, vmBusGUID string, settings hcsschema.VirtualPciDevice) error

// RemoveDevice removes the vPCI device identified by `vmBusGUID` from the Utility VM.
RemoveDevice(ctx context.Context, vmBusGUID string) error
}

// linuxGuestVPCI exposes vPCI device operations in the LCOW guest.
// Implemented by [guestmanager.Guest].
//
// It is only invoked after the host-side assignment has succeeded; a failure
// here causes [Controller.AddToVM] to mark the device invalid.
type linuxGuestVPCI interface {
// AddVPCIDevice adds a vPCI device to the guest.
AddVPCIDevice(ctx context.Context, settings guestresource.LCOWMappedVPCIDevice) error
}

// ==============================================================================
// INTERNAL DATA STRUCTURES
// ==============================================================================

// deviceInfo records one vPCI device's assignment state and reference count.
//
// An entry is created by [Controller.Reserve] with a zero reference count and
// is deleted by [Controller.RemoveFromVM].
type deviceInfo struct {
// device is the immutable host device identifier used to detect duplicate
// assignment requests.
device Device

// vmBusGUID identifies the vPCI device (backed by a VMBus channel)
// inside the UVM.
vmBusGUID guid.GUID

// refCount is the number of active callers sharing this device.
// Zero means the device is reserved but has not been assigned to the VM.
// Access must be guarded by [Controller.mu].
refCount uint32

// invalid indicates the host-side assignment succeeded but the guest-side
// assignment failed. Once set, [Controller.AddToVM] rejects the device.
// Access must be guarded by [Controller.mu].
invalid bool
}
34 changes: 32 additions & 2 deletions internal/controller/device/vpci/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package vpci

import (
"fmt"
"path/filepath"
"strconv"
)
Expand All @@ -17,6 +18,16 @@ const (
DeviceIDType = "vpci-instance-id"
)

const (
// vmBusChannelTypeGUIDFormatted is the well-known channel type GUID defined by
// VMBus for all assigned devices. The value already carries its surrounding
// braces, so it is inserted into instance IDs verbatim.
vmBusChannelTypeGUIDFormatted = "{44c4f61d-4444-4400-9d52-802e27ede19f}"

// assignedDeviceEnumerator is the VMBus enumerator prefix used in device
// instance IDs for assigned devices. It forms the first path component.
assignedDeviceEnumerator = "VMBUS"
)

// IsValidDeviceType returns true if the device type is valid i.e. supported by the runtime.
func IsValidDeviceType(deviceType string) bool {
return (deviceType == DeviceIDType) ||
Expand All @@ -30,9 +41,28 @@ func GetDeviceInfoFromPath(rawDevicePath string) (string, uint16) {
indexString := filepath.Base(rawDevicePath)
index, err := strconv.ParseUint(indexString, 10, 16)
if err == nil {
// we have a vf index
// We have a VF index.
return filepath.Dir(rawDevicePath), uint16(index)
}
// otherwise, just use default index and full device ID given
// Otherwise, just use default index and the full device ID as given.
return rawDevicePath, 0
}

// GetAssignedDeviceVMBUSInstanceID builds the instance ID of the VMBus channel
// device node that appears in a UVM when a device is assigned to it via vPCI.
//
// Assigning a device through HCS vPCI support creates a new VMBus channel
// device node in the UVM; the assigned device itself is exposed as a child of
// that node. An instance ID distinguishes a device node from every other
// device on the system, and for a VMBus channel node it is derived from the
// channel's unique GUID.
//
// vmBusChannelGUID is the channel GUID without surrounding braces; the braces
// are added here. The result has the form:
//
//	"VMBUS\{channelTypeGUID}\{vmBusChannelGUID}"
func GetAssignedDeviceVMBUSInstanceID(vmBusChannelGUID string) string {
	// A raw string literal keeps the backslash path separators readable.
	const instanceIDFormat = `%s\%s\{%s}`

	return fmt.Sprintf(
		instanceIDFormat,
		assignedDeviceEnumerator,
		vmBusChannelTypeGUIDFormatted,
		vmBusChannelGUID,
	)
}
217 changes: 217 additions & 0 deletions internal/controller/device/vpci/vpci.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
//go:build windows

package vpci

import (
"context"
"fmt"
"sync"

"github.com/Microsoft/go-winio/pkg/guid"
"github.com/Microsoft/hcsshim/internal/logfields"
"github.com/sirupsen/logrus"

hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
"github.com/Microsoft/hcsshim/internal/log"
)

// Controller manages vPCI device assignments for a Utility VM.
//
// All exported methods take mu for their full duration, including the
// host-side and guest-side calls they make.
type Controller struct {
// mu guards devices and deviceToGUID, as well as the mutable fields of
// every [deviceInfo] they reference.
mu sync.Mutex

// devices tracks reserved and currently assigned vPCI devices, keyed by
// VMBus GUID. An entry with a zero reference count is reserved but not yet
// assigned. Guarded by mu.
devices map[guid.GUID]*deviceInfo

// deviceToGUID maps a [Device] to its VMBus GUID for duplicate detection
// during [Controller.Reserve]. Guarded by mu.
deviceToGUID map[Device]guid.GUID

// vmVPCI performs host-side vPCI device add/remove on the VM.
vmVPCI vmVPCI

// linuxGuestVPCI performs guest-side vPCI device setup for LCOW.
linuxGuestVPCI linuxGuestVPCI
}

// New returns a [Controller] with empty tracking tables that is ready for use.
//
// vmVPCI carries out host-side add/remove operations on the VM, and
// linuxGuestVPCI carries out guest-side device setup for LCOW.
func New(
	vmVPCI vmVPCI,
	linuxGuestVPCI linuxGuestVPCI,
) *Controller {
	// Start with the tracking maps allocated so later writes never hit a nil map.
	ctrl := &Controller{
		devices:      map[guid.GUID]*deviceInfo{},
		deviceToGUID: map[Device]guid.GUID{},
	}

	ctrl.vmVPCI = vmVPCI
	ctrl.linuxGuestVPCI = linuxGuestVPCI

	return ctrl
}

// Reserve records a reservation for the given vPCI device and returns the
// VMBus GUID that identifies it. Pass the GUID to [Controller.AddToVM] to
// actually assign the device to the VM.
//
// A device is identified by its DeviceInstanceID together with its
// VirtualFunctionIndex. Reserving a device that is already reserved returns
// the GUID handed out the first time; no new GUID is generated.
//
// Every Virtual Function becomes an independent guest device with its own
// VMBus GUID, so several Virtual Functions of one physical device are tracked
// as separate devices.
//
// An error is returned only when a new GUID cannot be generated.
func (c *Controller) Reserve(ctx context.Context, device Device) (guid.GUID, error) {
	deviceFields := logrus.Fields{
		logfields.DeviceID: device.DeviceInstanceID,
		logfields.VFIndex:  device.VirtualFunctionIndex,
	}
	ctx, _ = log.WithContext(ctx, logrus.WithFields(deviceFields))

	c.mu.Lock()
	defer c.mu.Unlock()

	// Hand back the GUID from the earlier reservation, if there is one.
	reserved, found := c.deviceToGUID[device]
	if found {
		log.G(ctx).WithField(logfields.VMBusGUID, reserved).Debug("vPCI device already reserved, reusing existing GUID")
		return reserved, nil
	}

	// First reservation for this device: mint a fresh VMBus GUID.
	fresh, err := guid.NewV4()
	if err != nil {
		return guid.GUID{}, fmt.Errorf("generate vmbus guid for device %s: %w", device.DeviceInstanceID, err)
	}

	// Record the reservation in both lookup directions. The reference count
	// starts at zero until AddToVM assigns the device.
	c.deviceToGUID[device] = fresh
	c.devices[fresh] = &deviceInfo{device: device, vmBusGUID: fresh}

	log.G(ctx).WithField(logfields.VMBusGUID, fresh).Debug("reserved vPCI device with new VMBus GUID")
	return fresh, nil
}

// AddToVM assigns a previously reserved vPCI device to the VM.
// The vmBusGUID must have been obtained from a prior call to [Controller.Reserve].
// If the device is already assigned to the VM, the existing assignment is reused.
func (c *Controller) AddToVM(ctx context.Context, vmBusGUID guid.GUID) error {
// Set vmBusGUID in logging context.
ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.VMBusGUID, vmBusGUID))

c.mu.Lock()
defer c.mu.Unlock()

dev, ok := c.devices[vmBusGUID]
if !ok {
return fmt.Errorf("no reservation found for vmBusGUID %s; call Reserve first", vmBusGUID)
}

// If a previous assignment left the device in an invalid state,
// reject new callers until the existing assignment is cleaned up.
if dev.invalid {
return fmt.Errorf("vpci device with vmBusGUID %s is in an invalid state", vmBusGUID)
}

ctx, _ = log.WithContext(ctx, logrus.WithFields(logrus.Fields{
logfields.DeviceID: dev.device.DeviceInstanceID,
logfields.VFIndex: dev.device.VirtualFunctionIndex,
}))

// If the device is already assigned to the VM (host-side call was already made),
// just bump the reference count and return.
if dev.refCount > 0 {
dev.refCount++

log.G(ctx).Debug("vPCI device already assigned, reusing existing assignment")
return nil
}

// Device not yet attached to VM.
log.G(ctx).Debug("assigning vPCI device to VM")

// NUMA affinity is always propagated for assigned devices.
// This feature is available on WS2025 and later.
// Since the V2 shims only support WS2025 and later, this is set as true.
propagateAffinity := true

settings := hcsschema.VirtualPciDevice{
Functions: []hcsschema.VirtualPciFunction{
{
DeviceInstancePath: dev.device.DeviceInstanceID,
VirtualFunction: dev.device.VirtualFunctionIndex,
},
},
PropagateNumaAffinity: &propagateAffinity,
}

guidStr := vmBusGUID.String()

// Host-side: add the vPCI device to the VM.
if err := c.vmVPCI.AddDevice(ctx, guidStr, settings); err != nil {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this a guidstr? Its required to be a guid? If it needs to be a str do it in that AddDevice fn not before calling it.

return fmt.Errorf("add vpci device %s to vm: %w", dev.device.DeviceInstanceID, err)
}

// Update the ref count to indicate the device is now assigned to the VM.
dev.refCount++
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want a state machine here? Device is added but if wait ready fails what start are we in?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just invalid is good enough I guess?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If add fails maybe mark it invalid as well and then its fine ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So state machine seemed excessive here. I added a field isInvalid which took care of the same. Should we consider state machine for consistency?


// Guest-side: device attach notification.
if err := c.waitGuestDeviceReady(ctx, guidStr); err != nil {
// Mark the device as invalid so the caller can call RemoveFromVM
// to clean up the host-side assignment.
dev.invalid = true
log.G(ctx).WithError(err).Error("guest-side vpci device setup failed, device marked invalid")
return fmt.Errorf("add guest vpci device with vmBusGUID %s to vm: %w", vmBusGUID, err)
}

log.G(ctx).Info("vPCI device assigned to VM")

return nil
}

// RemoveFromVM removes a vPCI device from the VM.
// If the device is shared (reference count > 1), the reference count is
// decremented without actually removing the device from the VM.
func (c *Controller) RemoveFromVM(ctx context.Context, vmBusGUID guid.GUID) error {
c.mu.Lock()
defer c.mu.Unlock()

ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.VMBusGUID, vmBusGUID))

dev, ok := c.devices[vmBusGUID]
if !ok {
return fmt.Errorf("no vpci device with vmBusGUID %s is assigned to the vm", vmBusGUID)
}

// Device was reserved but never added to the VM. Just clean up the reservation.
if dev.refCount == 0 {
log.G(ctx).Debug("vPCI device was reserved but never assigned, cleaning up reservation")

delete(c.devices, vmBusGUID)
delete(c.deviceToGUID, dev.device)

return nil
}

// Decrement the ref count for the device.
dev.refCount--
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You cant decrement until success

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just have it be that its a > 0 check for a simple -- otherwise its a do the things, and set to 0 for the last one.

if dev.refCount > 0 {
log.G(ctx).WithField("refCount", dev.refCount).Debug("vPCI device still in use, decremented ref count")
return nil
}

// Last reference dropped (refCount == 0). Remove the device from the VM.
// This also covers devices marked invalid during AddToVM — the host-side
// assignment still needs to be cleaned up.

log.G(ctx).Debug("removing vPCI device from VM")

// Host-side: remove the vPCI device from the VM.
if err := c.vmVPCI.RemoveDevice(ctx, vmBusGUID.String()); err != nil {
// Restore the ref count since the removal failed.
dev.refCount++
return fmt.Errorf("remove vpci device %s from vm: %w", vmBusGUID, err)
}

delete(c.devices, vmBusGUID)
delete(c.deviceToGUID, dev.device)

log.G(ctx).Info("vPCI device removed from VM")
return nil
}
17 changes: 17 additions & 0 deletions internal/controller/device/vpci/vpci_lcow.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//go:build windows && lcow

package vpci

import (
"context"

"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
)

// waitGuestDeviceReady tells the LCOW guest about the newly assigned device by
// sending it the device's VMBus GUID, and returns the guest's result. Per the
// package documentation, this lets the GCS wait for the required device paths
// to become available before workloads use them.
func (c *Controller) waitGuestDeviceReady(ctx context.Context, vmBusGUID string) error {
	// The VMBus GUID is the only field the controller sets on the guest request.
	mapped := guestresource.LCOWMappedVPCIDevice{VMBusGUID: vmBusGUID}

	return c.linuxGuestVPCI.AddVPCIDevice(ctx, mapped)
}
Loading
Loading