From 60c10e50672f66dec16c4ce266f7b534e87d0b34 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 3 Apr 2024 12:53:44 +0000 Subject: [PATCH 01/19] platform/api/azure: Add features to published gallery We want to indicated support for trusted launch (TPM + secure boot) for testing. Trusted launch is only availabe for Gen2 VMs so we need some conditional template magic to continue working for Gen1 VMs. We also need to indicate image support for NVMe for testing NVMe only instances. For completeness we also mark support for accelerated networking. Signed-off-by: Jeremi Piotrowski --- platform/api/azure/gallery-image-template.json | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/platform/api/azure/gallery-image-template.json b/platform/api/azure/gallery-image-template.json index 794181de6..643f4785d 100644 --- a/platform/api/azure/gallery-image-template.json +++ b/platform/api/azure/gallery-image-template.json @@ -84,7 +84,15 @@ { "name": "DiskControllerTypes", "value": "NVMe,SCSI" - } + }, + { + "name": "SecurityType", + "value": "[if(equals(parameters('hyperVGeneration'), 'V2'), 'TrustedLaunchSupported', 'None')]" + }, + { + "name": "IsAcceleratedNetworkSupported", + "value": "true" + } ] }, "type": "Microsoft.Compute/galleries/images" From 3f5bbecfa9a26da70019d591df21ea6f4999140e Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 3 Apr 2024 14:46:28 +0000 Subject: [PATCH 02/19] cmd/ore/azure: Use flatcar instead of coreos in paths Signed-off-by: Jeremi Piotrowski --- cmd/ore/azure/upload-blob.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/ore/azure/upload-blob.go b/cmd/ore/azure/upload-blob.go index ba49fd940..21febdfb2 100644 --- a/cmd/ore/azure/upload-blob.go +++ b/cmd/ore/azure/upload-blob.go @@ -64,7 +64,7 @@ func init() { func defaultUploadFile() string { build := sdk.BuildRoot() - return build + "/images/amd64-usr/latest/coreos_production_azure_image.vhd" + return build + "/images/amd64-usr/latest/flatcar_production_azure_image.vhd" } func runUploadBlob(cmd *cobra.Command, args []string) { @@ -73,7 +73,7 @@ func runUploadBlob(cmd *cobra.Command, args []string) { if err != nil { plog.Fatalf("Unable to get version from image directory, provide a -blob-name flag or include a version.txt in the image directory: %v\n", err) } - ubo.blob = fmt.Sprintf("Container-Linux-dev-%s-%s.vhd", os.Getenv("USER"), ver.Version) + ubo.blob = fmt.Sprintf("flatcar-dev-%s-%s.vhd", os.Getenv("USER"), ver.Version) } if err := api.SetupClients(); err != nil { From 592ef9565ff9dfe8a1fccf2c6862f237c8969596 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 3 Apr 2024 14:48:56 +0000 Subject: [PATCH 03/19] cmd/ore/azure: Call StartLogging manually Since all ore commands call WrapPreRun, they overwrite the default implementation that calls StartLogging(). To get plog to work StartLogging() must be called manually. Signed-off-by: Jeremi Piotrowski --- cli/cli.go | 4 ++-- cmd/ore/azure/azure.go | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cli/cli.go b/cli/cli.go index 06b465af8..9ecaf5922 100644 --- a/cli/cli.go +++ b/cli/cli.go @@ -60,7 +60,7 @@ func Execute(main *cobra.Command) { "Alias for --log-level=DEBUG") WrapPreRun(main, func(cmd *cobra.Command, args []string) error { - startLogging(cmd) + StartLogging(cmd) return nil }) @@ -78,7 +78,7 @@ func setRepoLogLevel(repo string, l capnslog.LogLevel) { r.SetRepoLogLevel(l) } -func startLogging(cmd *cobra.Command) { +func StartLogging(cmd *cobra.Command) { switch { case logDebug: logLevel = capnslog.DEBUG diff --git a/cmd/ore/azure/azure.go b/cmd/ore/azure/azure.go index 9892411cd..4c3a61cac 100644 --- a/cmd/ore/azure/azure.go +++ b/cmd/ore/azure/azure.go @@ -42,6 +42,7 @@ func init() { } func preauth(cmd *cobra.Command, args []string) error { + cli.StartLogging(cmd) plog.Printf("Creating Azure API...") a, err := azure.New(&azure.Options{ From 126e656ae3f8caf1406ee737be348469ee1332a0 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 3 Apr 2024 14:50:43 +0000 Subject: [PATCH 04/19] cmd/ore/azure: Initialize platform.Options So that commands can set fields like 'Board' later on. Signed-off-by: Jeremi Piotrowski --- cmd/ore/azure/azure.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/ore/azure/azure.go b/cmd/ore/azure/azure.go index 4c3a61cac..cd22fdb0b 100644 --- a/cmd/ore/azure/azure.go +++ b/cmd/ore/azure/azure.go @@ -19,6 +19,7 @@ import ( "github.com/spf13/cobra" "github.com/flatcar/mantle/cli" + "github.com/flatcar/mantle/platform" "github.com/flatcar/mantle/platform/api/azure" ) @@ -47,6 +48,7 @@ func preauth(cmd *cobra.Command, args []string) error { a, err := azure.New(&azure.Options{ Location: azureLocation, + Options: &platform.Options{}, }) if err != nil { plog.Fatalf("Failed to create Azure API: %v", err) From 570a34064b23387e23474d369a09a3668869e457 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 3 Apr 2024 14:53:18 +0000 Subject: [PATCH 05/19] cmd/ore/azure: Add command to create-gallery-image This ore command creates a gallery image for use in multiple VM creations, and could be used to speed up multiple kola invocations. The command takes care of uploading the vhd to blob storage and creates resource group and storage account as well. Signed-off-by: Jeremi Piotrowski --- cmd/ore/azure/create-gallery-image.go | 118 ++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 cmd/ore/azure/create-gallery-image.go diff --git a/cmd/ore/azure/create-gallery-image.go b/cmd/ore/azure/create-gallery-image.go new file mode 100644 index 000000000..f8e16409c --- /dev/null +++ b/cmd/ore/azure/create-gallery-image.go @@ -0,0 +1,118 @@ +// Copyright 2018 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package azure + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/flatcar/mantle/platform/api/azure" + "github.com/flatcar/mantle/sdk" + "github.com/spf13/cobra" +) + +var ( + cmdCreateGalleryImage = &cobra.Command{ + Use: "create-gallery-image", + Short: "Create Azure Gallery Image", + Long: "Create Azure Gallery Image mage from a VHD image", + RunE: runCreateGalleryImage, + } + + vhd string + blobName string + storageAccount string + resourceGrp string + hyperVGeneration string + board string +) + +func init() { + sv := cmdCreateGalleryImage.Flags().StringVar + + sv(&imageName, "image-name", "", "image name (optional)") + sv(&blobName, "blob-name", "", "source blob name (optional)") + sv(&vhd, "file", defaultUploadFile(), "source VHD file") + sv(&resourceGrp, "resource-group", "", "resource group name (optional)") + sv(&hyperVGeneration, "hyper-v-generation", "V2", "Hyper-V generation (V2 or V1)") + sv(&board, "board", "amd64-usr", "board name (amd64-usr or arm64-usr)") + sv(&storageAccount, "storage-account", "", "storage account name (optional)") + + Azure.AddCommand(cmdCreateGalleryImage) +} + +func azureSanitize(name string) string { + name = strings.Replace(name, ".", "-", -1) + name = strings.Replace(name, "+", "-", -1) + return name +} + +func runCreateGalleryImage(cmd *cobra.Command, args []string) error { + var err error + if err = api.SetupClients(); err != nil { + plog.Fatalf("setting up clients: %v\n", err) + } + api.Opts.Board = board + api.Opts.HyperVGeneration = hyperVGeneration + + if blobName == "" { + ver, err := sdk.VersionsFromDir(filepath.Dir(vhd)) + if err != nil { + plog.Fatalf("Unable to get version from image directory, provide a -blob-name flag or include a version.txt in the image directory: %v\n", err) + } + blobName = fmt.Sprintf("flatcar-dev-%s-%s", os.Getenv("USER"), ver.Version) + } + if imageName == "" { + imageName = azureSanitize(strings.TrimSuffix(blobName, ".vhd")) + } + if resourceGrp == "" { + resourceGrp, err = api.CreateResourceGroup("kola-cluster-image") + if err != nil { + plog.Fatalf("Couldn't create resource group: %v\n", err) + } + } + if storageAccount == "" { + storageAccount, err = api.CreateStorageAccount(resourceGrp) + if err != nil { + plog.Fatalf("Couldn't create storage account: %v\n", err) + } + } + client, err := api.GetBlobServiceClient(storageAccount) + if err != nil { + plog.Fatalf("failed to create blob service client for %q: %v", ubo.storageacct, err) + } + + container := "vhds" + if err := azure.UploadBlob(client, vhd, container, blobName, true); err != nil { + plog.Fatalf("Uploading blob failed: %v", err) + } + blobUrl := azure.BlobURL(client, container, blobName) + imgID, err := api.CreateGalleryImage(imageName, resourceGrp, storageAccount, blobUrl) + if err != nil { + plog.Fatalf("Couldn't create gallery image: %v\n", err) + } + err = json.NewEncoder(os.Stdout).Encode(&struct { + ID *string + }{ + ID: &imgID, + }) + if err != nil { + plog.Fatalf("Couldn't encode result: %v\n", err) + } + return nil +} From fee9ed94b9bee7684c21bc4237c1e0f7038c9101 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 3 Apr 2024 15:51:20 +0000 Subject: [PATCH 06/19] platform/api/azure: Enable TPM for Gen2 instances Now that our gallery images are created with trusted launch support, we can enable trusted launch and TPM on the instance. At some point we'll be able to pass custom secure boot keys too. Signed-off-by: Jeremi Piotrowski --- platform/api/azure/instance.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index df3fb8768..4b1f23365 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -155,6 +155,18 @@ func (a *API) getVMParameters(name, sshkey, storageAccountURI string, userdata * }, } + if a.Opts.HyperVGeneration == string(armcompute.HyperVGenerationTypeV2) && + (a.Opts.UseGallery || strings.Contains(a.Opts.DiskURI, "galleries")) && + a.Opts.Board == "amd64-usr" { + vm.Properties.SecurityProfile = &armcompute.SecurityProfile{ + SecurityType: to.Ptr(armcompute.SecurityTypesTrustedLaunch), + UefiSettings: &armcompute.UefiSettings{ + SecureBootEnabled: to.Ptr(false), + VTpmEnabled: to.Ptr(true), + }, + } + } + switch a.Opts.DiskController { case "nvme": vm.Properties.StorageProfile.DiskControllerType = to.Ptr(armcompute.DiskControllerTypesNVMe) From a0dc3d192231b79cf8e6d97bd4c3ca164e83aff1 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 28 Jun 2024 09:40:18 +0200 Subject: [PATCH 07/19] platform/azure: Reuse single storage account for boot logs We currently create a storage account for every test cluster. Storage account creation takes 20-30 seconds. This storage account is only used for VM console logs. We can easily reuse a single storage account for all vm console logs, thereby speeding up the whole test execution. Introduce a field called storageAccountRG in struct cluster, so that we can keep track of the correct RG when fetching VM logs. Signed-off-by: Jeremi Piotrowski --- platform/machine/azure/cluster.go | 11 ++++++----- platform/machine/azure/flight.go | 33 ++++++++++++++----------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/platform/machine/azure/cluster.go b/platform/machine/azure/cluster.go index 8b91cee6f..31ece6d29 100644 --- a/platform/machine/azure/cluster.go +++ b/platform/machine/azure/cluster.go @@ -27,11 +27,12 @@ import ( type cluster struct { *platform.BaseCluster - flight *flight - sshKey string - ResourceGroup string - StorageAccount string - Network azure.Network + flight *flight + sshKey string + ResourceGroup string + StorageAccountRG string + StorageAccount string + Network azure.Network } func (ac *cluster) vmname() string { diff --git a/platform/machine/azure/flight.go b/platform/machine/azure/flight.go index 834a4a7c7..aa0aa80fe 100644 --- a/platform/machine/azure/flight.go +++ b/platform/machine/azure/flight.go @@ -41,6 +41,7 @@ type flight struct { ImageResourceGroup string ImageStorageAccount string Network azure.Network + UseFlightRG bool } // NewFlight creates an instance of a Flight suitable for spawning @@ -79,21 +80,21 @@ func NewFlight(opts *azure.Options) (platform.Flight, error) { return nil, err } + af.ImageResourceGroup, err = af.Api.CreateResourceGroup("kola-cluster-image") + if err != nil { + return nil, err + } + af.ImageStorageAccount, err = af.Api.CreateStorageAccount(af.ImageResourceGroup) + if err != nil { + return nil, err + } + if opts.BlobURL != "" || opts.ImageFile != "" { + af.UseFlightRG = true imageName := fmt.Sprintf("%v", time.Now().UnixNano()) blobName := imageName + ".vhd" container := "temp" - af.ImageResourceGroup, err = af.Api.CreateResourceGroup("kola-cluster-image") - if err != nil { - return nil, err - } - - af.ImageStorageAccount, err = af.Api.CreateStorageAccount(af.ImageResourceGroup) - if err != nil { - return nil, err - } - af.Network, err = af.Api.PrepareNetworkResources(af.ImageResourceGroup) if err != nil { af.Destroy() @@ -162,21 +163,17 @@ func (af *flight) NewCluster(rconf *platform.RuntimeConfig) (platform.Cluster, e ac.sshKey = af.FakeSSHKey } - if af.ImageResourceGroup != "" && af.ImageStorageAccount != "" { + ac.StorageAccountRG = af.ImageResourceGroup + ac.StorageAccount = af.ImageStorageAccount + + if af.UseFlightRG { ac.ResourceGroup = af.ImageResourceGroup - ac.StorageAccount = af.ImageStorageAccount ac.Network = af.Network } else { ac.ResourceGroup, err = af.Api.CreateResourceGroup("kola-cluster") if err != nil { return nil, err } - - ac.StorageAccount, err = af.Api.CreateStorageAccount(ac.ResourceGroup) - if err != nil { - return nil, err - } - ac.Network, err = af.Api.PrepareNetworkResources(ac.ResourceGroup) if err != nil { ac.Destroy() From f5bd2b49847c856f9ff4b660172378092ea1994b Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 17 Apr 2024 16:56:37 +0000 Subject: [PATCH 08/19] make trusted launch opt-in --- cmd/kola/options.go | 1 + platform/api/azure/instance.go | 10 +++++++--- platform/api/azure/options.go | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cmd/kola/options.go b/cmd/kola/options.go index 2d9781080..514ca8fdd 100644 --- a/cmd/kola/options.go +++ b/cmd/kola/options.go @@ -124,6 +124,7 @@ func init() { sv(&kola.AzureOptions.ResourceGroup, "azure-resource-group", "", "Deploy resources in an existing resource group") sv(&kola.AzureOptions.AvailabilitySet, "azure-availability-set", "", "Deploy instances with an existing availibity set") sv(&kola.AzureOptions.KolaVnet, "azure-kola-vnet", "", "Pass the vnet/subnet that kola is being ran from to restrict network access to created storage accounts") + bv(&kola.AzureOptions.TrustedLaunch, "azure-trusted-launch", false, "Enable trusted launch for VMs (default \"false\")") // do-specific options sv(&kola.DOOptions.ConfigPath, "do-config-file", "", "DigitalOcean config file (default \"~/"+auth.DOConfigPath+"\")") diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index 4b1f23365..69e4838d9 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -155,9 +155,13 @@ func (a *API) getVMParameters(name, sshkey, storageAccountURI string, userdata * }, } - if a.Opts.HyperVGeneration == string(armcompute.HyperVGenerationTypeV2) && - (a.Opts.UseGallery || strings.Contains(a.Opts.DiskURI, "galleries")) && - a.Opts.Board == "amd64-usr" { + if a.Opts.TrustedLaunch { + if a.Opts.HyperVGeneration != string(armcompute.HyperVGenerationTypeV2) { + plog.Warningf("TrustedLaunch is only supported for HyperVGeneration v2; ignoring") + } + if a.Opts.Board != "amd64-usr" { + plog.Warningf("TrustedLaunch is only supported for amd64-usr; ignoring") + } vm.Properties.SecurityProfile = &armcompute.SecurityProfile{ SecurityType: to.Ptr(armcompute.SecurityTypesTrustedLaunch), UefiSettings: &armcompute.UefiSettings{ diff --git a/platform/api/azure/options.go b/platform/api/azure/options.go index e95a5bbd5..9690fe8e4 100644 --- a/platform/api/azure/options.go +++ b/platform/api/azure/options.go @@ -52,6 +52,7 @@ type Options struct { KolaVnet string UseGallery bool UsePrivateIPs bool + TrustedLaunch bool DiskController string From 2e8a354cd55e2fc1c7e03c1ff16a62223deb5001 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 22 Apr 2024 12:29:46 +0200 Subject: [PATCH 09/19] platform/api/azure: Ignore -gen2 suffix on sku when resolving latest version Signed-off-by: Jeremi Piotrowski --- platform/api/azure/image.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/api/azure/image.go b/platform/api/azure/image.go index afc1e346b..9ed7c1366 100644 --- a/platform/api/azure/image.go +++ b/platform/api/azure/image.go @@ -152,8 +152,8 @@ func (a *API) resolveImage() error { if a.Opts.DiskURI != "" || a.Opts.BlobURL != "" || a.Opts.ImageFile != "" || a.Opts.Version != "" || a.Opts.Sku == "" { return nil } - - resp, err := http.DefaultClient.Get(fmt.Sprintf("https://%s.release.flatcar-linux.net/amd64-usr/current/version.txt", a.Opts.Sku)) + sku := strings.TrimSuffix(a.Opts.Sku, "-gen2") + resp, err := http.DefaultClient.Get(fmt.Sprintf("https://%s.release.flatcar-linux.net/amd64-usr/current/version.txt", sku)) if err != nil { return fmt.Errorf("unable to fetch release bucket %v version: %v", a.Opts.Sku, err) } From a641d2aef21babe7a236ad715998e7f22c2ee59c Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 26 Apr 2024 12:07:09 +0000 Subject: [PATCH 10/19] kola: Use skip func to skip cl.misc.nvidia Signed-off-by: Jeremi Piotrowski --- kola/tests/misc/nvidia.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kola/tests/misc/nvidia.go b/kola/tests/misc/nvidia.go index 7a6f24edf..49a1e6881 100644 --- a/kola/tests/misc/nvidia.go +++ b/kola/tests/misc/nvidia.go @@ -3,8 +3,10 @@ package misc import ( "bytes" "fmt" + "strings" "time" + "github.com/coreos/go-semver/semver" "github.com/coreos/pkg/capnslog" "github.com/flatcar/mantle/kola" "github.com/flatcar/mantle/kola/cluster" @@ -28,13 +30,19 @@ func init() { Platforms: []string{"azure"}, Architectures: []string{"amd64"}, Flags: []register.Flag{register.NoEnableSelinux}, + SkipFunc: skipOnNonGpu, }) } -func verifyNvidiaInstallation(c cluster.TestCluster) { - if kola.AzureOptions.Size != "Standard_NC6s_v3" { - c.Skip("skipping due to wrong instance size") +func skipOnNonGpu(version semver.Version, channel, arch, platform string) bool { + // N stands for GPU instance obviously :) + if platform == "azure" && strings.Contains(kola.AzureOptions.Size, "N") { + return false } + return true +} + +func verifyNvidiaInstallation(c cluster.TestCluster) { m := c.Machines()[0] nvidiaStatusRetry := func() error { From beb1ad03be25da6445e6e1f1443654fbf0424d62 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 17 Jun 2024 10:21:03 +0200 Subject: [PATCH 11/19] kola: Make RunTests return on runtime failures So that the deferred flight.Destroy() is called. The only caller terminates the application on failure anyway. --- cmd/kola/kola.go | 6 ++---- kola/harness.go | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cmd/kola/kola.go b/cmd/kola/kola.go index 81d91b27b..8b831007f 100644 --- a/cmd/kola/kola.go +++ b/cmd/kola/kola.go @@ -134,13 +134,11 @@ func runRun(cmd *cobra.Command, args []string) { // needs to be after RunTests() because harness empties the directory if err := writeProps(); err != nil { - fmt.Fprintf(os.Stderr, "%v\n", err) - os.Exit(1) + plog.Fatal(err) } if runErr != nil { - fmt.Fprintf(os.Stderr, "%v\n", runErr) - os.Exit(1) + plog.Fatal(runErr) } } diff --git a/kola/harness.go b/kola/harness.go index cb59f5f5d..be66629f3 100644 --- a/kola/harness.go +++ b/kola/harness.go @@ -432,7 +432,7 @@ func RunTests(patterns []string, channel, offering, pltfrm, outputDir string, ss version, err := getClusterSemver(flight, outputDir) if err != nil { - plog.Fatal(err) + return fmt.Errorf("getClusterSemver: %w ", err) } // If the version is > 3033, we can safely use user-data instead of custom-data for @@ -453,7 +453,7 @@ func RunTests(patterns []string, channel, offering, pltfrm, outputDir string, ss // one more filter pass now that we know real version tests, err = FilterTests(tests, patterns, channel, offering, pltfrm, *version) if err != nil { - plog.Fatal(err) + return fmt.Errorf("FilterTests: %v", err) } } From 11ade1f5bf1c88f09349d436ee3104ea1156922f Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 29 Jul 2024 14:11:19 +0200 Subject: [PATCH 12/19] disable force vm delete This makes it harder to track down legit failures. --- platform/api/azure/instance.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index 69e4838d9..bbc5dc282 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -225,7 +225,7 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, clean := func() { _, _ = a.compClient.BeginDelete(context.TODO(), vmResourceGroup, name, &armcompute.VirtualMachinesClientBeginDeleteOptions{ - ForceDeletion: to.Ptr(true), + ForceDeletion: to.Ptr(false), }) _, _ = a.intClient.BeginDelete(context.TODO(), resourceGroup, *nic.Name, nil) _, _ = a.ipClient.BeginDelete(context.TODO(), resourceGroup, *ip.Name, nil) @@ -293,7 +293,7 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, func (a *API) TerminateInstance(machine *Machine, resourceGroup string) error { resourceGroup = a.getVMRG(resourceGroup) _, err := a.compClient.BeginDelete(context.TODO(), resourceGroup, machine.ID, &armcompute.VirtualMachinesClientBeginDeleteOptions{ - ForceDeletion: to.Ptr(true), + ForceDeletion: to.Ptr(false), }) // We used to wait for the VM to be deleted here, but it's not necessary as // we will also delete the resource group later. From e391ca15d4caae3cde005748bffc4ed5ce02a331 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Thu, 8 Aug 2024 13:00:50 +0200 Subject: [PATCH 13/19] TMP: azure: fetch console log from failed provisioning --- platform/api/azure/instance.go | 3 +-- platform/machine/azure/cluster.go | 13 ++++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index bbc5dc282..31c695278 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -238,8 +238,7 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, } _, err = poller.PollUntilDone(context.TODO(), nil) if err != nil { - clean() - return nil, err + return &Machine{ID: name}, fmt.Errorf("PollUntilDone: %w", err) } plog.Infof("Instance %s created", name) diff --git a/platform/machine/azure/cluster.go b/platform/machine/azure/cluster.go index 31ece6d29..90b2dcaaa 100644 --- a/platform/machine/azure/cluster.go +++ b/platform/machine/azure/cluster.go @@ -49,21 +49,24 @@ func (ac *cluster) NewMachine(userdata *conf.UserData) (platform.Machine, error) return nil, err } - instance, err := ac.flight.Api.CreateInstance(ac.vmname(), ac.sshKey, ac.ResourceGroup, ac.StorageAccount, conf, ac.Network) - if err != nil { - return nil, err - } - + instance, createErr := ac.flight.Api.CreateInstance(ac.vmname(), ac.sshKey, ac.ResourceGroup, ac.StorageAccount, conf, ac.Network) mach := &machine{ cluster: ac, mach: instance, } + if instance == nil { + return nil, createErr + } mach.dir = filepath.Join(ac.RuntimeConf().OutputDir, mach.ID()) if err := os.Mkdir(mach.dir, 0777); err != nil { mach.Destroy() return nil, err } + if createErr != nil { + mach.Destroy() + return nil, createErr + } confPath := filepath.Join(mach.dir, "user-data") if err := conf.WriteFile(confPath); err != nil { From 4e69e8c55c101f25a348cee79b8fe9e8690851e7 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Thu, 8 Aug 2024 14:25:15 +0200 Subject: [PATCH 14/19] TMP: azure: timeout after 15s --- platform/api/azure/instance.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index 31c695278..affdc7de1 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -236,7 +236,9 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, clean() return nil, err } - _, err = poller.PollUntilDone(context.TODO(), nil) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + _, err = poller.PollUntilDone(ctx, nil) if err != nil { return &Machine{ID: name}, fmt.Errorf("PollUntilDone: %w", err) } From c38c12dd7681f40df0310753f924886e3afa71c0 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Thu, 8 Aug 2024 14:25:27 +0200 Subject: [PATCH 15/19] TMP: retry: Sleep after failure --- util/retry.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/retry.go b/util/retry.go index 554f6fbe7..8fe43f0ba 100644 --- a/util/retry.go +++ b/util/retry.go @@ -57,8 +57,6 @@ func WaitUntilReady(timeout, delay time.Duration, checkFunction func() (bool, er default: } - time.Sleep(delay) - done, err := checkFunction() if err != nil { return err @@ -67,6 +65,8 @@ func WaitUntilReady(timeout, delay time.Duration, checkFunction func() (bool, er if done { break } + + time.Sleep(delay) } return nil } From 524e36fb06c4390d1c1179bd5634b03dbfd54f4e Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 5 Aug 2024 10:46:18 +0200 Subject: [PATCH 16/19] azure: Switch to managed boot diagnostics for console This does not require that the user have RBAC permissions to a storage account to fetch, because it uses SAS keys behind the scenes. The previous approach used a kola created storage account has Shared Key Access disabled for security reasons. Signed-off-by: Jeremi Piotrowski --- platform/api/azure/instance.go | 43 ++++++++-------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index affdc7de1..117e262a5 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -19,7 +19,7 @@ import ( "encoding/base64" "fmt" "io" - "regexp" + "net/http" "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" @@ -148,8 +148,7 @@ func (a *API) getVMParameters(name, sshkey, storageAccountURI string, userdata * }, DiagnosticsProfile: &armcompute.DiagnosticsProfile{ BootDiagnostics: &armcompute.BootDiagnostics{ - Enabled: to.Ptr(true), - StorageURI: &storageAccountURI, + Enabled: to.Ptr(true), }, }, }, @@ -303,46 +302,24 @@ func (a *API) TerminateInstance(machine *Machine, resourceGroup string) error { func (a *API) GetConsoleOutput(name, resourceGroup, storageAccount string) ([]byte, error) { vmResourceGroup := a.getVMRG(resourceGroup) - vm, err := a.compClient.Get(context.TODO(), vmResourceGroup, name, &armcompute.VirtualMachinesClientGetOptions{ - Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), - }) + param := &armcompute.VirtualMachinesClientRetrieveBootDiagnosticsDataOptions{ + SasURIExpirationTimeInMinutes: to.Ptr[int32](5), + } + resp, err := a.compClient.RetrieveBootDiagnosticsData(context.TODO(), vmResourceGroup, name, param) if err != nil { return nil, fmt.Errorf("could not get VM: %v", err) } - - consoleURI := vm.Properties.InstanceView.BootDiagnostics.SerialConsoleLogBlobURI - if consoleURI == nil { + if resp.SerialConsoleLogBlobURI == nil { return nil, fmt.Errorf("serial console URI is nil") } - // Only the full URI to the logs are present in the virtual machine - // properties. Parse out the container & file name to use the GetBlob - // API call directly. - uri := []byte(*consoleURI) - containerPat := regexp.MustCompile(`bootdiagnostics-[a-z0-9\-]+`) - container := string(containerPat.Find(uri)) - if container == "" { - return nil, fmt.Errorf("could not find container name in URI: %q", *consoleURI) - } - namePat := regexp.MustCompile(`[a-z0-9\-\.]+.serialconsole.log`) - blobname := string(namePat.Find(uri)) - if blobname == "" { - return nil, fmt.Errorf("could not find blob name in URI: %q", *consoleURI) - } - - client, err := a.GetBlobServiceClient(storageAccount) - if err != nil { - return nil, err - } var data io.ReadCloser err = util.Retry(6, 10*time.Second, func() error { - data, err = GetBlob(client, container, blobname) + reply, err := http.Get(*resp.SerialConsoleLogBlobURI) if err != nil { - return fmt.Errorf("could not get blob for container %q, blobname %q: %v", container, blobname, err) - } - if data == nil { - return fmt.Errorf("empty data while getting blob for container %q, blobname %q", container, blobname) + return fmt.Errorf("could not GET console output: %v", err) } + data = reply.Body return nil }) if err != nil { From f253c39912ac66037cfc3ca7c36edd0572cbed64 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Thu, 8 Aug 2024 14:54:17 +0200 Subject: [PATCH 17/19] TMP: azure: Save screenshot on failure --- platform/api/azure/instance.go | 42 ++++++++++++++++++++++++++++++- platform/machine/azure/machine.go | 10 ++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index 117e262a5..ec9b4e285 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -30,12 +30,20 @@ import ( "github.com/flatcar/mantle/util" ) +type MachineState int + +const ( + READY MachineState = iota + PROVISIONING +) + type Machine struct { ID string PublicIPAddress string PrivateIPAddress string InterfaceName string PublicIPName string + State MachineState } func (a *API) getAvset() string { @@ -239,7 +247,7 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, defer cancel() _, err = poller.PollUntilDone(ctx, nil) if err != nil { - return &Machine{ID: name}, fmt.Errorf("PollUntilDone: %w", err) + return &Machine{ID: name, State: PROVISIONING}, fmt.Errorf("PollUntilDone: %w", err) } plog.Infof("Instance %s created", name) @@ -300,6 +308,38 @@ func (a *API) TerminateInstance(machine *Machine, resourceGroup string) error { return err } +func (a *API) GetScreenshot(name, resourceGroup string, output io.Writer) error { + vmResourceGroup := a.getVMRG(resourceGroup) + param := &armcompute.VirtualMachinesClientRetrieveBootDiagnosticsDataOptions{ + SasURIExpirationTimeInMinutes: to.Ptr[int32](5), + } + resp, err := a.compClient.RetrieveBootDiagnosticsData(context.TODO(), vmResourceGroup, name, param) + if err != nil { + return fmt.Errorf("could not get VM: %v", err) + } + if resp.ConsoleScreenshotBlobURI == nil { + return fmt.Errorf("console screenshot URI is nil") + } + + var data io.ReadCloser + err = util.Retry(6, 10*time.Second, func() error { + reply, err := http.Get(*resp.ConsoleScreenshotBlobURI) + if err != nil { + return fmt.Errorf("could not GET console screenshot: %v", err) + } + data = reply.Body + return nil + }) + if err != nil { + return err + } + written, err := io.Copy(output, data) + if err == nil { + plog.Debugf("wrote %d bytes to screenshot", written) + } + return err +} + func (a *API) GetConsoleOutput(name, resourceGroup, storageAccount string) ([]byte, error) { vmResourceGroup := a.getVMRG(resourceGroup) param := &armcompute.VirtualMachinesClientRetrieveBootDiagnosticsDataOptions{ diff --git a/platform/machine/azure/machine.go b/platform/machine/azure/machine.go index 2470f5dbb..4c2e80ff0 100644 --- a/platform/machine/azure/machine.go +++ b/platform/machine/azure/machine.go @@ -126,6 +126,16 @@ func (am *machine) saveConsole() error { return fmt.Errorf("failed writing console to file: %v", err) } + if am.mach.State == azure.PROVISIONING { + path := filepath.Join(am.dir, "screenshot.bmp") + f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + return err + } + defer f.Close() + return am.cluster.flight.Api.GetScreenshot(am.ID(), am.ResourceGroup(), f) + } + return nil } From 426d223c0dfb71ab3dcda278ba6251c5d6c9cf31 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Thu, 8 Aug 2024 15:45:32 +0200 Subject: [PATCH 18/19] Revert "TMP: azure: timeout after 15s" This reverts commit 2647b48a652e4a006c4744fad739b92c63659a47. --- platform/api/azure/instance.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index ec9b4e285..1eca18daa 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -243,9 +243,7 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, clean() return nil, err } - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - _, err = poller.PollUntilDone(ctx, nil) + _, err = poller.PollUntilDone(context.TODO(), nil) if err != nil { return &Machine{ID: name, State: PROVISIONING}, fmt.Errorf("PollUntilDone: %w", err) } From 05e3cf32122c9c9781b654f87ffdffa9a1eca5cd Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Thu, 8 Aug 2024 16:46:56 +0200 Subject: [PATCH 19/19] TMP: azure: add instance name to provisioning failure log Signed-off-by: Jeremi Piotrowski --- platform/api/azure/instance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/api/azure/instance.go b/platform/api/azure/instance.go index 1eca18daa..079a884b7 100644 --- a/platform/api/azure/instance.go +++ b/platform/api/azure/instance.go @@ -245,7 +245,7 @@ func (a *API) CreateInstance(name, sshkey, resourceGroup, storageAccount string, } _, err = poller.PollUntilDone(context.TODO(), nil) if err != nil { - return &Machine{ID: name, State: PROVISIONING}, fmt.Errorf("PollUntilDone: %w", err) + return &Machine{ID: name, State: PROVISIONING}, fmt.Errorf("PollUntilDone(%s): %w", name, err) } plog.Infof("Instance %s created", name)