diff --git a/Makefile b/Makefile index b9687e802f..c22b9b2aa9 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,10 @@ TAG?=test binary: CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o $(BINARY_NAME) ./cmd/main.go +.PHONY: check-legacy-packages +check-legacy-packages: + go test ./tests/containerprofilecache -run TestLegacyPackagesDeleted + docker-build-only: docker buildx build --platform linux/amd64 -t $(IMAGE):$(TAG) -f $(DOCKERFILE_PATH) --load . diff --git a/cmd/main.go b/cmd/main.go index 75b79678fd..6fcaaca422 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -45,10 +45,9 @@ import ( "github.com/kubescape/node-agent/pkg/nodeprofilemanager" nodeprofilemanagerv1 "github.com/kubescape/node-agent/pkg/nodeprofilemanager/v1" "github.com/kubescape/node-agent/pkg/objectcache" - "github.com/kubescape/node-agent/pkg/objectcache/applicationprofilecache" + "github.com/kubescape/node-agent/pkg/objectcache/containerprofilecache" "github.com/kubescape/node-agent/pkg/objectcache/dnscache" "github.com/kubescape/node-agent/pkg/objectcache/k8scache" - "github.com/kubescape/node-agent/pkg/objectcache/networkneighborhoodcache" objectcachev1 "github.com/kubescape/node-agent/pkg/objectcache/v1" "github.com/kubescape/node-agent/pkg/processtree" containerprocesstree "github.com/kubescape/node-agent/pkg/processtree/container" @@ -297,16 +296,14 @@ func main() { ruleBindingNotify = make(chan rulebinding.RuleBindingNotify, 100) ruleBindingCache.AddNotifier(&ruleBindingNotify) - apc := applicationprofilecache.NewApplicationProfileCache(cfg, storageClient, k8sObjectCache, exporter) - apc.Start(ctx) - - nnc := networkneighborhoodcache.NewNetworkNeighborhoodCache(cfg, storageClient, k8sObjectCache, exporter) - nnc.Start(ctx) + cpc := containerprofilecache.NewContainerProfileCache(cfg, storageClient, k8sObjectCache, prometheusExporter) + cpc.Start(ctx) + logger.L().Info("ContainerProfileCache active; legacy AP/NN caches removed") dc := dnscache.NewDnsCache(dnsResolver) // create object cache - objCache = objectcachev1.NewObjectCache(k8sObjectCache, apc, nnc, dc) + objCache = objectcachev1.NewObjectCache(k8sObjectCache, cpc, dc) ruleCooldown := rulecooldown.NewRuleCooldown(cfg.RuleCoolDown) @@ -328,10 +325,9 @@ func main() { } else { ruleManager = rulemanager.CreateRuleManagerMock() - apc := &objectcache.ApplicationProfileCacheMock{} - nnc := &objectcache.NetworkNeighborhoodCacheMock{} + cpc := &objectcache.ContainerProfileCacheMock{} dc := &objectcache.DnsCacheMock{} - objCache = objectcachev1.NewObjectCache(k8sObjectCache, apc, nnc, dc) + objCache = objectcachev1.NewObjectCache(k8sObjectCache, cpc, dc) ruleBindingNotify = make(chan rulebinding.RuleBindingNotify, 1) } diff --git a/go.mod b/go.mod index 5dea4d016c..c74fe99f0a 100644 --- a/go.mod +++ b/go.mod @@ -35,7 +35,7 @@ require ( github.com/joncrlsn/dque v0.0.0-20241024143830-7723fd131a64 github.com/kubescape/backend v0.0.39 github.com/kubescape/go-logger v0.0.28 - github.com/kubescape/k8s-interface v0.0.206 + github.com/kubescape/k8s-interface v0.0.208 github.com/kubescape/storage v0.0.258 github.com/kubescape/workerpool v0.0.0-20250526074519-0e4a4e7f44cf github.com/moby/sys/mountinfo v0.7.2 @@ -60,6 +60,7 @@ require ( go.uber.org/multierr v1.11.0 golang.org/x/net v0.53.0 golang.org/x/sys v0.43.0 + golang.org/x/tools v0.43.0 gonum.org/v1/plot v0.14.0 google.golang.org/grpc v1.80.0 google.golang.org/protobuf v1.36.11 @@ -473,7 +474,6 @@ require ( golang.org/x/term v0.42.0 // indirect golang.org/x/text v0.36.0 // indirect golang.org/x/time v0.15.0 // indirect - 
golang.org/x/tools v0.43.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect google.golang.org/api v0.271.0 // indirect google.golang.org/genproto v0.0.0-20260128011058-8636f8732409 // indirect diff --git a/go.sum b/go.sum index 689cf6d6c5..bbfd3daaff 100644 --- a/go.sum +++ b/go.sum @@ -1013,8 +1013,8 @@ github.com/kubescape/backend v0.0.39 h1:B1QRfKCSFlzuE+jWOnk/l7EpH71/Q3n14KKq0QSn github.com/kubescape/backend v0.0.39/go.mod h1:cMEGP8cXUZgY89YU4GRBGIla9HZW7grZsUtlCwvZgAE= github.com/kubescape/go-logger v0.0.28 h1:xulKTp9kOg3rD98sopFELQ6yZCHQoQXMDzteoSHDFKI= github.com/kubescape/go-logger v0.0.28/go.mod h1:YZHFjwGCDar1hP9OyBLE46oR7a0Y/Z/0FperDo8+9D0= -github.com/kubescape/k8s-interface v0.0.206 h1:qaYu4mlLmSBePanSGq+DBCssh4O785TAT0lQGNGWyGw= -github.com/kubescape/k8s-interface v0.0.206/go.mod h1:WNYUG93aZ5kDmuaRKFLtVhp18Yc6EfaHdD1gLYtVTN4= +github.com/kubescape/k8s-interface v0.0.208 h1:vmZ2FVAQRsz3XRKNG/6wJAYvZJ12RtMoDTLVxFEktms= +github.com/kubescape/k8s-interface v0.0.208/go.mod h1:WNYUG93aZ5kDmuaRKFLtVhp18Yc6EfaHdD1gLYtVTN4= github.com/kubescape/workerpool v0.0.0-20250526074519-0e4a4e7f44cf h1:hI0jVwrB6fT4GJWvuUjzObfci1CUknrZdRHfnRVtKM0= github.com/kubescape/workerpool v0.0.0-20250526074519-0e4a4e7f44cf/go.mod h1:Il5baM40PV9cTt4OGdLMeTRRAai3TMfvImu31itIeCM= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= diff --git a/pkg/config/config.go b/pkg/config/config.go index 5e4807603b..dbc55b080f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -107,6 +107,7 @@ type Config struct { ProcfsPidScanInterval time.Duration `mapstructure:"procfsPidScanInterval"` ProcfsScanInterval time.Duration `mapstructure:"procfsScanInterval"` ProfilesCacheRefreshRate time.Duration `mapstructure:"profilesCacheRefreshRate"` + StorageRPCBudget time.Duration `mapstructure:"storageRPCBudget"` RuleCoolDown rulecooldown.RuleCooldownConfig `mapstructure:"ruleCooldown"` TestMode bool `mapstructure:"testMode"` UpdateDataPeriod time.Duration `mapstructure:"updateDataPeriod"` diff --git a/pkg/containerprofilemanager/v1/lifecycle.go b/pkg/containerprofilemanager/v1/lifecycle.go index 44b689600b..dc9b8ac45a 100644 --- a/pkg/containerprofilemanager/v1/lifecycle.go +++ b/pkg/containerprofilemanager/v1/lifecycle.go @@ -162,6 +162,7 @@ func (cpm *ContainerProfileManager) addContainer(container *containercollection. 
// Setup monitoring timer sniffingTime := cpm.calculateSniffingTime(container) + sharedData.LearningPeriod = sniffingTime timer := time.AfterFunc(sniffingTime, func() { cpm.handleContainerMaxTime(container) }) diff --git a/pkg/containerwatcher/v2/container_watcher_collection.go b/pkg/containerwatcher/v2/container_watcher_collection.go index 834ecb4125..b919084aac 100644 --- a/pkg/containerwatcher/v2/container_watcher_collection.go +++ b/pkg/containerwatcher/v2/container_watcher_collection.go @@ -60,8 +60,7 @@ func (cw *ContainerWatcher) StartContainerCollection(ctx context.Context) error cw.containerCallbackAsync, cw.containerProcessTree.ContainerCallback, cw.containerProfileManager.ContainerCallback, - cw.objectCache.ApplicationProfileCache().ContainerCallback, - cw.objectCache.NetworkNeighborhoodCache().ContainerCallback, + cw.objectCache.ContainerProfileCache().ContainerCallback, cw.malwareManager.ContainerCallback, cw.ruleManager.ContainerCallback, cw.sbomManager.ContainerCallback, diff --git a/pkg/hostsensormanager/sensor_kubelet.go b/pkg/hostsensormanager/sensor_kubelet.go index 0950f5e1fc..dafb165773 100644 --- a/pkg/hostsensormanager/sensor_kubelet.go +++ b/pkg/hostsensormanager/sensor_kubelet.go @@ -4,8 +4,10 @@ import ( "context" "fmt" + logger "github.com/kubescape/go-logger" "github.com/kubescape/go-logger/helpers" "github.com/kubescape/k8s-interface/hostsensor" + "sigs.k8s.io/yaml" ) const ( @@ -25,6 +27,32 @@ var kubeletKubeConfigDefaultPathList = []string{ "/var/lib/kubelet/kubeconfig", } +var kubeletServiceFilePaths = []string{ + "/etc/systemd/system/kubelet.service", + "/usr/lib/systemd/system/kubelet.service", + "/lib/systemd/system/kubelet.service", +} + +const kubeletServiceDropInDir = "/etc/systemd/system/kubelet.service.d" + +// kubeletConfigYAML is a minimal subset of KubeletConfiguration for CA file extraction. +type kubeletConfigYAML struct { + Authentication struct { + X509 struct { + ClientCAFile string `json:"clientCAFile"` + } `json:"x509"` + } `json:"authentication"` +} + +// extractClientCAFromKubeletConfig parses kubelet config YAML and returns the clientCAFile path. 
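+// A config of the following shape (illustrative; the CA path is
+// cluster-specific, /etc/kubernetes/pki/ca.crt being the kubeadm default)
+// would yield "/etc/kubernetes/pki/ca.crt":
+//
+//	authentication:
+//	  x509:
+//	    clientCAFile: /etc/kubernetes/pki/ca.crt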
+func extractClientCAFromKubeletConfig(content []byte) (string, error) { + var cfg kubeletConfigYAML + if err := yaml.Unmarshal(content, &cfg); err != nil { + return "", fmt.Errorf("failed to parse kubelet config: %w", err) + } + return cfg.Authentication.X509.ClientCAFile, nil +} + // KubeletInfoSensor implements the Sensor interface for kubelet info data type KubeletInfoSensor struct { nodeName string @@ -73,12 +101,31 @@ func (s *KubeletInfoSensor) Sense() (interface{}, error) { ret.KubeConfigFile = makeContaineredFileInfoFromListVerbose(ctx, kubeletProcess, kubeletKubeConfigDefaultPathList, true, helpers.String("in", "SenseKubeletInfo")) } - // Client CA + // Client CA: check cmdLine first, then fall back to kubelet config YAML if caFilePath, ok := kubeletProcess.GetArg(kubeletClientCAArgName); ok { ret.ClientCAFile = makeContaineredFileInfoVerbose(ctx, kubeletProcess, caFilePath, false, helpers.String("in", "SenseKubeletInfo")) + } else if ret.ConfigFile != nil && len(ret.ConfigFile.Content) > 0 { + if caFilePath, err := extractClientCAFromKubeletConfig(ret.ConfigFile.Content); err != nil { + logger.L().Debug("failed to extract clientCAFile from kubelet config", helpers.String("in", "SenseKubeletInfo"), helpers.Error(err)) + } else if caFilePath != "" { + ret.ClientCAFile = makeContaineredFileInfoVerbose(ctx, kubeletProcess, caFilePath, false, helpers.String("in", "SenseKubeletInfo")) + } } ret.CmdLine = kubeletProcess.RawCmd() + // Service files: main unit file and drop-in directory + for _, svcPath := range kubeletServiceFilePaths { + if fi := makeHostFileInfoVerbose(ctx, svcPath, false); fi != nil { + ret.ServiceFiles = append(ret.ServiceFiles, *fi) + break + } + } + if dropIns, err := makeHostDirFilesInfoVerbose(ctx, kubeletServiceDropInDir, false, 0); err == nil { + for _, fi := range dropIns { + ret.ServiceFiles = append(ret.ServiceFiles, *fi) + } + } + return &ret, nil } diff --git a/pkg/metricsmanager/metrics_manager_interface.go b/pkg/metricsmanager/metrics_manager_interface.go index 1542c13006..e6c20b62c2 100644 --- a/pkg/metricsmanager/metrics_manager_interface.go +++ b/pkg/metricsmanager/metrics_manager_interface.go @@ -20,4 +20,9 @@ type MetricsManager interface { ReportContainerStart() ReportContainerStop() ReportDedupEvent(eventType utils.EventType, duplicate bool) + ReportContainerProfileLegacyLoad(kind, completeness string) + SetContainerProfileCacheEntries(kind string, count float64) + ReportContainerProfileCacheHit(hit bool) + ReportContainerProfileReconcilerDuration(phase string, duration time.Duration) + ReportContainerProfileReconcilerEviction(reason string) } diff --git a/pkg/metricsmanager/metrics_manager_mock.go b/pkg/metricsmanager/metrics_manager_mock.go index 74424e07b1..70f118da8e 100644 --- a/pkg/metricsmanager/metrics_manager_mock.go +++ b/pkg/metricsmanager/metrics_manager_mock.go @@ -66,4 +66,9 @@ func (m *MetricsMock) ReportContainerStart() {} func (m *MetricsMock) ReportContainerStop() {} -func (m *MetricsMock) ReportDedupEvent(eventType utils.EventType, duplicate bool) {} +func (m *MetricsMock) ReportDedupEvent(eventType utils.EventType, duplicate bool) {} +func (m *MetricsMock) ReportContainerProfileLegacyLoad(_, _ string) {} +func (m *MetricsMock) SetContainerProfileCacheEntries(_ string, _ float64) {} +func (m *MetricsMock) ReportContainerProfileCacheHit(_ bool) {} +func (m *MetricsMock) ReportContainerProfileReconcilerDuration(_ string, _ time.Duration) {} +func (m *MetricsMock) ReportContainerProfileReconcilerEviction(_ string) {} diff --git 
a/pkg/metricsmanager/metrics_manager_noop.go b/pkg/metricsmanager/metrics_manager_noop.go index c797f348a1..092b5a5e46 100644 --- a/pkg/metricsmanager/metrics_manager_noop.go +++ b/pkg/metricsmanager/metrics_manager_noop.go @@ -22,3 +22,8 @@ func (m *MetricsNoop) ReportRuleEvaluationTime(_ string, _ utils.EventType, _ ti func (m *MetricsNoop) ReportContainerStart() {} func (m *MetricsNoop) ReportContainerStop() {} func (m *MetricsNoop) ReportDedupEvent(_ utils.EventType, _ bool) {} +func (m *MetricsNoop) ReportContainerProfileLegacyLoad(_, _ string) {} +func (m *MetricsNoop) SetContainerProfileCacheEntries(_ string, _ float64) {} +func (m *MetricsNoop) ReportContainerProfileCacheHit(_ bool) {} +func (m *MetricsNoop) ReportContainerProfileReconcilerDuration(_ string, _ time.Duration) {} +func (m *MetricsNoop) ReportContainerProfileReconcilerEviction(_ string) {} diff --git a/pkg/metricsmanager/prometheus/prometheus.go b/pkg/metricsmanager/prometheus/prometheus.go index 30211664e6..d729924ab5 100644 --- a/pkg/metricsmanager/prometheus/prometheus.go +++ b/pkg/metricsmanager/prometheus/prometheus.go @@ -63,6 +63,13 @@ type PrometheusMetric struct { // Dedup metrics dedupEventCounter *prometheus.CounterVec + // ContainerProfile cache metrics + cpCacheLegacyLoadsCounter *prometheus.CounterVec + cpCacheEntriesGauge *prometheus.GaugeVec + cpCacheHitCounter *prometheus.CounterVec + cpReconcilerDurationHistogram *prometheus.HistogramVec + cpReconcilerEvictionsCounter *prometheus.CounterVec + // Cache to avoid allocating Labels maps on every call ruleCounterCache map[string]prometheus.Counter rulePrefilteredCounterCache map[string]prometheus.Counter @@ -215,6 +222,29 @@ func NewPrometheusMetric() *PrometheusMetric { Help: "Total number of events processed by the dedup layer", }, []string{eventTypeLabel, "result"}), + // ContainerProfile cache metrics + cpCacheLegacyLoadsCounter: promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "node_agent_user_profile_legacy_loads_total", + Help: "Number of times a user-authored legacy ApplicationProfile or NetworkNeighborhood was loaded into the ContainerProfileCache; will be removed in a future release.", + }, []string{"kind", "completeness"}), + cpCacheEntriesGauge: promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "node_agent_containerprofile_cache_entries", + Help: "Current number of cached ContainerProfile entries per kind.", + }, []string{"kind"}), + cpCacheHitCounter: promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "node_agent_containerprofile_cache_hit_total", + Help: "Total number of ContainerProfile cache lookups by result.", + }, []string{"result"}), + cpReconcilerDurationHistogram: promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "node_agent_containerprofile_reconciler_duration_seconds", + Help: "Duration of ContainerProfile reconciler phases in seconds.", + Buckets: prometheus.DefBuckets, + }, []string{"phase"}), + cpReconcilerEvictionsCounter: promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "node_agent_containerprofile_reconciler_evictions_total", + Help: "Total number of ContainerProfile cache evictions by reason.", + }, []string{"reason"}), + // Initialize counter caches ruleCounterCache: make(map[string]prometheus.Counter), rulePrefilteredCounterCache: make(map[string]prometheus.Counter), @@ -256,6 +286,11 @@ func (p *PrometheusMetric) Destroy() { prometheus.Unregister(p.containerStartCounter) prometheus.Unregister(p.containerStopCounter) prometheus.Unregister(p.dedupEventCounter) + 
prometheus.Unregister(p.cpCacheLegacyLoadsCounter) + prometheus.Unregister(p.cpCacheEntriesGauge) + prometheus.Unregister(p.cpCacheHitCounter) + prometheus.Unregister(p.cpReconcilerDurationHistogram) + prometheus.Unregister(p.cpReconcilerEvictionsCounter) // Unregister program ID metrics prometheus.Unregister(p.programRuntimeGauge) prometheus.Unregister(p.programRunCountGauge) @@ -432,3 +467,27 @@ func (p *PrometheusMetric) ReportDedupEvent(eventType utils.EventType, duplicate } p.dedupEventCounter.WithLabelValues(string(eventType), result).Inc() } + +func (p *PrometheusMetric) ReportContainerProfileLegacyLoad(kind, completeness string) { + p.cpCacheLegacyLoadsCounter.WithLabelValues(kind, completeness).Inc() +} + +func (p *PrometheusMetric) SetContainerProfileCacheEntries(kind string, count float64) { + p.cpCacheEntriesGauge.WithLabelValues(kind).Set(count) +} + +func (p *PrometheusMetric) ReportContainerProfileCacheHit(hit bool) { + result := "hit" + if !hit { + result = "miss" + } + p.cpCacheHitCounter.WithLabelValues(result).Inc() +} + +func (p *PrometheusMetric) ReportContainerProfileReconcilerDuration(phase string, duration time.Duration) { + p.cpReconcilerDurationHistogram.WithLabelValues(phase).Observe(duration.Seconds()) +} + +func (p *PrometheusMetric) ReportContainerProfileReconcilerEviction(reason string) { + p.cpReconcilerEvictionsCounter.WithLabelValues(reason).Inc() +} diff --git a/pkg/objectcache/applicationprofilecache/applicationprofilecache.go b/pkg/objectcache/applicationprofilecache/applicationprofilecache.go deleted file mode 100644 index 7875f4e741..0000000000 --- a/pkg/objectcache/applicationprofilecache/applicationprofilecache.go +++ /dev/null @@ -1,908 +0,0 @@ -package applicationprofilecache - -import ( - "context" - "fmt" - "strings" - "sync" - "time" - - "github.com/cenkalti/backoff/v5" - mapset "github.com/deckarep/golang-set/v2" - "github.com/goradd/maps" - containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" - "github.com/kubescape/go-logger" - "github.com/kubescape/go-logger/helpers" - helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" - "github.com/armosec/armoapi-go/armotypes" - "github.com/kubescape/node-agent/pkg/config" - "github.com/kubescape/node-agent/pkg/exporters" - "github.com/kubescape/node-agent/pkg/objectcache" - "github.com/kubescape/node-agent/pkg/objectcache/applicationprofilecache/callstackcache" - "github.com/kubescape/node-agent/pkg/resourcelocks" - "github.com/kubescape/node-agent/pkg/rulemanager/types" - "github.com/kubescape/node-agent/pkg/signature" - "github.com/kubescape/node-agent/pkg/signature/profiles" - "github.com/kubescape/node-agent/pkg/storage" - "github.com/kubescape/node-agent/pkg/utils" - "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" -) - -// ContainerInfo holds container metadata we need for application profile mapping -type ContainerInfo struct { - ContainerID string - WorkloadID string - InstanceTemplateHash string - Namespace string - Name string - SeenContainerFromTheStart bool // True if container was seen from the start - UserDefinedProfile string -} - -// ContainerCallStackIndex maintains call stack search trees for a container -type ContainerCallStackIndex struct { - searchTree *callstackcache.CallStackSearchTree -} - -type ApplicationProfileCacheImpl struct { - cfg config.Config - workloadIDToProfile maps.SafeMap[string, *v1beta1.ApplicationProfile] - workloadIDToProfileState maps.SafeMap[string, *objectcache.ProfileState] // 
Tracks profile state even if not in cache - containerIDToInfo maps.SafeMap[string, *ContainerInfo] - profileToUserManagedIdentifier maps.SafeMap[string, string] // profileName -> user-managed profile unique identifier (This is used to prevent merging the same user-managed profile multiple times) - containerToCallStackIndex maps.SafeMap[string, *ContainerCallStackIndex] - storageClient storage.ProfileClient - k8sObjectCache objectcache.K8sObjectCache - exporter exporters.Exporter // Exporter for sending tamper detection alerts - updateInterval time.Duration - updateInProgress bool // Flag to track if update is in progress - updateMutex sync.Mutex // Mutex to protect the flag - containerLocks *resourcelocks.ResourceLocks // Locks for each container to prevent concurrent modifications -} - -// NewApplicationProfileCache creates a new application profile cache with periodic updates -func NewApplicationProfileCache(cfg config.Config, storageClient storage.ProfileClient, k8sObjectCache objectcache.K8sObjectCache, exporter exporters.Exporter) *ApplicationProfileCacheImpl { - updateInterval := utils.AddJitter(cfg.ProfilesCacheRefreshRate, 10) // Add 10% jitter to avoid high load on the storage - - apc := &ApplicationProfileCacheImpl{ - cfg: cfg, - workloadIDToProfile: maps.SafeMap[string, *v1beta1.ApplicationProfile]{}, - workloadIDToProfileState: maps.SafeMap[string, *objectcache.ProfileState]{}, - containerIDToInfo: maps.SafeMap[string, *ContainerInfo]{}, - profileToUserManagedIdentifier: maps.SafeMap[string, string]{}, - containerToCallStackIndex: maps.SafeMap[string, *ContainerCallStackIndex]{}, - storageClient: storageClient, - k8sObjectCache: k8sObjectCache, - exporter: exporter, - updateInterval: updateInterval, - containerLocks: resourcelocks.New(), - } - - return apc -} - -// Start begins the periodic update process -func (apc *ApplicationProfileCacheImpl) Start(ctx context.Context) { - go apc.periodicUpdate(ctx) -} - -// periodicUpdate periodically fetches and updates application profiles from storage -func (apc *ApplicationProfileCacheImpl) periodicUpdate(ctx context.Context) { - ticker := time.NewTicker(apc.updateInterval) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - // Check if an update is already in progress - apc.updateMutex.Lock() - if apc.updateInProgress { - // Skip this update cycle - logger.L().Debug("skipping profile update: previous update still in progress") - apc.updateMutex.Unlock() - continue - } - - // Set the flag and release the lock before the potentially long-running call - apc.updateInProgress = true - apc.updateMutex.Unlock() - - // Run the update directly - apc.updateAllProfiles(ctx) - - // Mark the update as complete - apc.updateMutex.Lock() - apc.updateInProgress = false - apc.updateMutex.Unlock() - - case <-ctx.Done(): - logger.L().Info("ApplicationProfileCache periodic update stopped") - return - } - } -} - -// updateAllProfiles fetches all application profiles from storage and updates the cache -func (apc *ApplicationProfileCacheImpl) updateAllProfiles(ctx context.Context) { - // Get unique namespaces from container info - namespaces := apc.getNamespaces() - if len(namespaces) == 0 { - logger.L().Debug("no namespaces found in cache, skipping profile update") - return - } - - // Iterate over each namespace - for _, namespace := range namespaces { - // Get container IDs for this namespace - containerIDs := apc.getContainerIDsForNamespace(namespace) - if len(containerIDs) == 0 { - logger.L().Debug("no containers found for namespace, 
skipping", - helpers.String("namespace", namespace)) - continue - } - - // Get profiles list for this namespace - var profileList *v1beta1.ApplicationProfileList - continueToken := "" - for { - list, err := apc.storageClient.ListApplicationProfiles(namespace, int64(50), continueToken) - if err != nil { - logger.L().Error("failed to list application profiles", - helpers.String("namespace", namespace), - helpers.Error(err)) - break - } - - if profileList == nil { - profileList = list - } else { - profileList.Items = append(profileList.Items, list.Items...) - } - - continueToken = list.Continue - if continueToken == "" { - break - } - } - - if profileList == nil { - continue - } - - // Process each profile - for _, profile := range profileList.Items { - // Handle user-managed profiles - if isUserManagedProfile(&profile) { - apc.handleUserManagedProfile(&profile) - continue - } - - // Get the workload ID from profile - workloadID := apc.wlidKey(profile.Annotations[helpersv1.WlidMetadataKey], profile.Labels[helpersv1.TemplateHashKey]) - if workloadID == "" { - continue // this is the case for user-defined profiles - } - - // Update profile state regardless of whether we'll update the full profile - profileState := &objectcache.ProfileState{ - Completion: profile.Annotations[helpersv1.CompletionMetadataKey], - Status: profile.Annotations[helpersv1.StatusMetadataKey], - Name: profile.Name, - Error: nil, - } - apc.workloadIDToProfileState.Set(workloadID, profileState) - - // Only consider completed profiles - if profile.Annotations[helpersv1.StatusMetadataKey] != helpersv1.Completed { - continue - } - - // Check if this workload ID is used by any container in this namespace - workloadIDInUse := false - hasNewContainer := false // Track if any container using this workload was seen from start - for _, containerID := range containerIDs { - if containerInfo, exists := apc.containerIDToInfo.Load(containerID); exists && - containerInfo.WorkloadID == workloadID && - containerInfo.InstanceTemplateHash == profile.Labels[helpersv1.TemplateHashKey] { - workloadIDInUse = true - // If any container was seen from start, mark it - if containerInfo.SeenContainerFromTheStart { - hasNewContainer = true - } - } - } - - if !workloadIDInUse { - continue - } - - // If we have a "new" container (seen from start) and the profile is partial, - // skip it - we don't want to use partial profiles for containers we're tracking from the start - if hasNewContainer && profile.Annotations[helpersv1.CompletionMetadataKey] == helpersv1.Partial { - logger.L().Debug("updateAllProfiles: skipping partial profile for new container", - helpers.String("profileName", profile.Name), - helpers.String("workloadID", workloadID)) - continue - } - - // Update the profile in the cache - if existingProfile, exists := apc.workloadIDToProfile.Load(workloadID); exists { - // If the profile already exists and it's complete/completed, continue to the next one - if existingProfile.Annotations[helpersv1.CompletionMetadataKey] == helpersv1.Full { - continue - } - - // If the new profile is not complete and we already have a completed/partial one, skip it - if profile.Annotations[helpersv1.CompletionMetadataKey] != helpersv1.Full { - continue - } - } - - // Fetch the profile from storage - fullProfile, err := apc.storageClient.GetApplicationProfile(namespace, profile.Name) - if err != nil { - logger.L().Error("failed to get application profile", - helpers.String("workloadID", workloadID), - helpers.String("namespace", namespace), - 
helpers.String("profileName", profile.Name), - helpers.Error(err)) - // Update the profile state to indicate an error - profileState.Error = err - apc.workloadIDToProfileState.Set(workloadID, profileState) - continue - } - - // Verify signature if enabled - if err := apc.verifyApplicationProfile(fullProfile, workloadID, "profile", true); err != nil { - // Continue to next profile as per requirements: skip on verification failure - continue - } - - apc.workloadIDToProfile.Set(workloadID, fullProfile) - logger.L().Debug("application profile downloaded, starting anomaly detection", - helpers.String("workloadID", workloadID), - helpers.String("namespace", namespace), - helpers.String("status", profile.Annotations[helpersv1.StatusMetadataKey]), - helpers.String("completion", profile.Annotations[helpersv1.CompletionMetadataKey])) - - // Update call stack search trees for containers using this workload ID - for _, containerID := range containerIDs { - if containerInfo, exists := apc.containerIDToInfo.Load(containerID); exists && - containerInfo.WorkloadID == workloadID && - containerInfo.InstanceTemplateHash == profile.Labels[helpersv1.TemplateHashKey] { - // Create or update call stack search tree if not exists - apc.indexContainerCallStacks(containerID, containerInfo.Name, fullProfile) - } - } - } - // Continue to next namespace - } -} - -// verifyApplicationProfile verifies the profile signature. -// Always checks signed profiles for tamper (emits R1016 alert on tamper). -// When EnableSignatureVerification is true, also rejects tampered/unsigned profiles. -// Returns error if the profile should not be loaded, nil otherwise. -func (apc *ApplicationProfileCacheImpl) verifyApplicationProfile(profile *v1beta1.ApplicationProfile, workloadID, context string, recordFailure bool) error { - profileAdapter := profiles.NewApplicationProfileAdapter(profile) - - // Always check signed profiles for tamper, regardless of enforcement setting - if signature.IsSigned(profileAdapter) { - if err := signature.VerifyObject(profileAdapter); err != nil { - // Signed profile failed verification → tamper detected - logger.L().Warning(context+" signature verification failed (tamper detected)", - helpers.String("profile", profile.Name), - helpers.String("namespace", profile.Namespace), - helpers.String("workloadID", workloadID), - helpers.Error(err)) - - // Emit R1016 tamper alert - apc.emitTamperAlert(profile.Name, profile.Namespace, workloadID, "ApplicationProfile", err) - - if apc.cfg.EnableSignatureVerification { - if recordFailure { - apc.setVerificationFailed(workloadID, profile.Name, err) - } - return err - } - // Enforcement off: allow loading despite tamper - return nil - } - logger.L().Debug(context+" verification successful", - helpers.String("profile", profile.Name), - helpers.String("namespace", profile.Namespace)) - return nil - } - - // Profile is not signed - if apc.cfg.EnableSignatureVerification { - logger.L().Debug(context+" is not signed, skipping", - helpers.String("profile", profile.Name), - helpers.String("namespace", profile.Namespace), - helpers.String("workloadID", workloadID)) - if recordFailure { - apc.setVerificationFailed(workloadID, profile.Name, signature.ErrObjectNotSigned) - } - return signature.ErrObjectNotSigned - } - - return nil -} - -// emitTamperAlert sends an R1016 "Signed profile tampered" alert via the exporter. 
-func (apc *ApplicationProfileCacheImpl) emitTamperAlert(profileName, namespace, workloadID, objectKind string, verifyErr error) { - if apc.exporter == nil { - return - } - - ruleFailure := &types.GenericRuleFailure{ - BaseRuntimeAlert: armotypes.BaseRuntimeAlert{ - AlertName: "Signed profile tampered", - InfectedPID: 1, - Severity: 10, - FixSuggestions: "Investigate who modified the " + objectKind + " '" + profileName + "' in namespace '" + namespace + "'. Re-sign the profile after verifying its contents.", - }, - AlertType: armotypes.AlertTypeRule, - RuntimeProcessDetails: armotypes.ProcessTree{ - ProcessTree: armotypes.Process{ - PID: 1, - Comm: "node-agent", - }, - }, - RuleAlert: armotypes.RuleAlert{ - RuleDescription: fmt.Sprintf("Signed %s '%s' in namespace '%s' has been tampered with: %v", objectKind, profileName, namespace, verifyErr), - }, - RuntimeAlertK8sDetails: armotypes.RuntimeAlertK8sDetails{ - Namespace: namespace, - }, - RuleID: "R1016", - } - - // Populate workload details from workloadID if available - ruleFailure.SetWorkloadDetails(extractWlidFromWorkloadID(workloadID)) - - apc.exporter.SendRuleAlert(ruleFailure) -} - -// extractWlidFromWorkloadID extracts the wlid part from a "wlid/templateHash" key. -func extractWlidFromWorkloadID(workloadID string) string { - if idx := strings.LastIndex(workloadID, "/"); idx > 0 { - // workloadID format is "wlid://<cluster>/<namespace>/<kind>/<name>/<templateHash>" - // We need everything before the last "/"; the final segment is the templateHash - return workloadID[:idx] - } - return workloadID -} - -func (apc *ApplicationProfileCacheImpl) setVerificationFailed(workloadID, profileName string, err error) { - profileState := &objectcache.ProfileState{ - Completion: "failed", - Status: "verification-failed", - Name: profileName, - Error: err, - } - apc.workloadIDToProfileState.Set(workloadID, profileState) -} - -// handleUserManagedProfile handles user-managed profiles -func (apc *ApplicationProfileCacheImpl) handleUserManagedProfile(profile *v1beta1.ApplicationProfile) { - normalizedProfileName := strings.TrimPrefix(profile.Name, helpersv1.UserApplicationProfilePrefix) - userManagedProfileUniqueIdentifier := profile.ResourceVersion + string(profile.UID) - - // Create a unique tracking key for this user profile - profileKey := apc.profileKey(profile.Namespace, normalizedProfileName) - - // Check if we've already processed this exact version of the user-managed profile - if storedIdentifier, exists := apc.profileToUserManagedIdentifier.Load(profileKey); exists && - storedIdentifier == userManagedProfileUniqueIdentifier { - return - } - - // Find and collect the profile to merge - var toMerge struct { - wlid string - profile *v1beta1.ApplicationProfile - } - - apc.workloadIDToProfile.Range(func(wlid string, originalProfile *v1beta1.ApplicationProfile) bool { - if originalProfile.Name == normalizedProfileName && originalProfile.Namespace == profile.Namespace { - toMerge.wlid = wlid - toMerge.profile = originalProfile - logger.L().Debug("found matching profile for user-managed profile", - helpers.String("workloadID", wlid), - helpers.String("namespace", originalProfile.Namespace), - helpers.String("profileName", originalProfile.Name)) - // Stop iteration - return false - } - return true - }) - - // If we didn't find a matching profile, skip merging - if toMerge.profile == nil { - return - } - - // Fetch the full user profile - fullUserProfile, err := apc.storageClient.GetApplicationProfile(profile.Namespace, profile.Name) - if err != nil { - logger.L().Error("failed to get user-managed profile",
- helpers.String("namespace", profile.Namespace), - helpers.String("profileName", profile.Name), - helpers.Error(err)) - return - } - - // Verify signature if enabled - if err := apc.verifyApplicationProfile(fullUserProfile, toMerge.wlid, "user-managed profile", false); err != nil { - return - } - - // Merge the user-managed profile with the normal profile - - // First, pull the original profile from the storage - originalProfile, err := apc.storageClient.GetApplicationProfile(toMerge.profile.Namespace, toMerge.profile.Name) - if err != nil { - logger.L().Error("failed to get original profile", - helpers.String("namespace", toMerge.profile.Namespace), - helpers.String("profileName", toMerge.profile.Name), - helpers.Error(err)) - return - } - // Merge the profiles - mergedProfile := apc.performMerge(originalProfile, fullUserProfile) - // Update the cache with the merged profile - apc.workloadIDToProfile.Set(toMerge.wlid, mergedProfile) - // Update profile state for the merged profile - profileState := &objectcache.ProfileState{ - Completion: mergedProfile.Annotations[helpersv1.CompletionMetadataKey], - Status: mergedProfile.Annotations[helpersv1.StatusMetadataKey], - Name: mergedProfile.Name, - Error: nil, - } - apc.workloadIDToProfileState.Set(toMerge.wlid, profileState) - - logger.L().Debug("merged user-managed profile with normal profile", - helpers.String("workloadID", toMerge.wlid), - helpers.String("namespace", profile.Namespace), - helpers.String("profileName", profile.Name)) - - // We need to index the call stacks for the merged profile here, but currently we don't support that. - - // Record that we've processed this version of the profile - apc.profileToUserManagedIdentifier.Set(profileKey, userManagedProfileUniqueIdentifier) -} - -// indexContainerCallStacks builds the search index for a container's call stacks and removes them from the profile -func (apc *ApplicationProfileCacheImpl) indexContainerCallStacks(containerID, containerName string, appProfile *v1beta1.ApplicationProfile) { - if appProfile == nil { - logger.L().Warning("ApplicationProfileCacheImpl - application profile is nil", - helpers.String("containerID", containerID), - helpers.String("containerName", containerName)) - return - } - - // Create a new call stack search tree - callStackSearchTree := callstackcache.NewCallStackSearchTree() - apc.containerToCallStackIndex.Set(containerID, &ContainerCallStackIndex{ - searchTree: callStackSearchTree, - }) - - // Iterate over the containers in the application profile - // Find the container in the profile and index its call stacks - for _, c := range appProfile.Spec.Containers { - if c.Name == containerName { - // Index all call stacks - for _, stack := range c.IdentifiedCallStacks { - callStackSearchTree.AddCallStack(stack) - } - - // Clear the call stacks to free memory - c.IdentifiedCallStacks = nil - break - } - } - - // Also check init containers - for _, c := range appProfile.Spec.InitContainers { - if c.Name == containerName { - for _, stack := range c.IdentifiedCallStacks { - callStackSearchTree.AddCallStack(stack) - } - - // Clear the call stacks to free memory - c.IdentifiedCallStacks = nil - break - } - } - - // And ephemeral containers - for _, c := range appProfile.Spec.EphemeralContainers { - if c.Name == containerName { - for _, stack := range c.IdentifiedCallStacks { - callStackSearchTree.AddCallStack(stack) - } - - // Clear the call stacks to free memory - c.IdentifiedCallStacks = nil - break - } - } -} - -// ContainerCallback handles container lifecycle 
events -func (apc *ApplicationProfileCacheImpl) ContainerCallback(notif containercollection.PubSubEvent) { - isHost := utils.IsHostContainer(notif.Container) - namespace := notif.Container.K8s.Namespace - if isHost { - namespace = "host" - } - switch notif.Type { - case containercollection.EventTypeAddContainer: - if !isHost && apc.cfg.IgnoreContainer(namespace, notif.Container.K8s.PodName, notif.Container.K8s.PodLabels) { - return - } - container := notif.Container - if isHost { - containerCopy := *notif.Container - containerCopy.K8s.Namespace = namespace - container = &containerCopy - } - go apc.addContainerWithTimeout(container) - case containercollection.EventTypeRemoveContainer: - if !isHost && apc.cfg.IgnoreContainer(namespace, notif.Container.K8s.PodName, notif.Container.K8s.PodLabels) { - return - } - go apc.deleteContainer(notif.Container.Runtime.ContainerID) - } -} - -// addContainerWithTimeout handles adding a container with a timeout to prevent hanging -func (apc *ApplicationProfileCacheImpl) addContainerWithTimeout(container *containercollection.Container) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - - done := make(chan error, 1) - go func() { - done <- apc.addContainer(container, ctx) - }() - - select { - case err := <-done: - if err != nil { - logger.L().Error("failed to add container to the cache", helpers.Error(err)) - } - case <-ctx.Done(): - logger.L().Error("timeout while adding container to the cache", - helpers.String("containerID", container.Runtime.ContainerID), - helpers.String("containerName", container.Runtime.ContainerName), - helpers.String("podName", container.K8s.PodName), - helpers.String("namespace", container.K8s.Namespace)) - } -} - -// addContainer adds a container to the cache -func (apc *ApplicationProfileCacheImpl) addContainer(container *containercollection.Container, ctx context.Context) error { - containerID := container.Runtime.ContainerID - - return apc.containerLocks.WithLockAndError(containerID, func() error { - // Get workload ID from shared data - sharedData, err := apc.waitForSharedContainerData(containerID, ctx) - if err != nil { - logger.L().Error("failed to get shared data for container", - helpers.String("containerID", containerID), - helpers.Error(err)) - return err - } - - workloadID := apc.wlidKey(sharedData.Wlid, sharedData.InstanceID.GetTemplateHash()) - if workloadID == "" { - logger.L().Debug("empty workloadID for container", helpers.String("containerID", containerID)) - return nil - } - - // If container restarts and profile is partial, delete it from cache - // This ensures we don't alert on activity we didn't see after restart - if existingProfile, exists := apc.workloadIDToProfile.Load(workloadID); exists && !sharedData.PreRunningContainer { - if existingProfile != nil && existingProfile.Annotations != nil { - completion := existingProfile.Annotations[helpersv1.CompletionMetadataKey] - if completion == helpersv1.Partial { - logger.L().Debug("deleting partial profile on container restart", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace)) - - // Delete the profile from cache - profileKey := apc.profileKey(existingProfile.Namespace, existingProfile.Name) - apc.profileToUserManagedIdentifier.Delete(profileKey) - apc.workloadIDToProfile.Delete(workloadID) - - // Also delete call stack indices for all containers using this workload ID - // (including the current container if it exists 
from a previous run) - apc.containerToCallStackIndex.Delete(containerID) - apc.containerIDToInfo.Range(func(cID string, info *ContainerInfo) bool { - if info.WorkloadID == workloadID { - apc.containerToCallStackIndex.Delete(cID) - } - return true - }) - } - } - } else { - apc.workloadIDToProfileState.Set(workloadID, &objectcache.ProfileState{ - Error: fmt.Errorf("waiting for profile update"), - }) - } - - // Create container info - // Mark container as "seen from start" if it is not pre-running - containerInfo := &ContainerInfo{ - ContainerID: containerID, - WorkloadID: workloadID, - InstanceTemplateHash: sharedData.InstanceID.GetTemplateHash(), - Namespace: container.K8s.Namespace, - Name: container.Runtime.ContainerName, - SeenContainerFromTheStart: !sharedData.PreRunningContainer, - } - - // Check for user-defined profile - if userDefinedProfile, ok := container.K8s.PodLabels[helpersv1.UserDefinedProfileMetadataKey]; ok { - if userDefinedProfile != "" { - // Set the user-defined profile in container info - containerInfo.UserDefinedProfile = userDefinedProfile - // Fetch the profile from storage - // TODO should we cache user-defined profiles separately? - it could allow deduplication - fullProfile, err := apc.storageClient.GetApplicationProfile(container.K8s.Namespace, userDefinedProfile) - if err != nil { - logger.L().Error("failed to get user-defined profile", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace), - helpers.String("profileName", userDefinedProfile), - helpers.Error(err)) - // Update the profile state to indicate an error - profileState := &objectcache.ProfileState{ - Error: err, - } - apc.workloadIDToProfileState.Set(workloadID, profileState) - return nil - } - - // Verify signature if enabled - if err := apc.verifyApplicationProfile(fullProfile, workloadID, "user-defined profile", false); err != nil { - // Update the profile state to indicate an error - profileState := &objectcache.ProfileState{ - Error: fmt.Errorf("signature verification failed: %w", err), - } - apc.workloadIDToProfileState.Set(workloadID, profileState) - // Skip caching the unverified profile - return nil - } - - // Update the profile in the cache - apc.workloadIDToProfile.Set(workloadID, fullProfile) - profileState := &objectcache.ProfileState{ - Completion: fullProfile.Annotations[helpersv1.CompletionMetadataKey], - Status: fullProfile.Annotations[helpersv1.StatusMetadataKey], - Name: fullProfile.Name, - } - apc.workloadIDToProfileState.Set(workloadID, profileState) - logger.L().Debug("user-defined application profile downloaded, starting anomaly detection", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace), - helpers.String("profileName", userDefinedProfile)) - } - } - - // Add to container info map - apc.containerIDToInfo.Set(containerID, containerInfo) - - logger.L().Debug("container added to cache", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace)) - - return nil - }) -} - -// deleteContainer deletes a container from the cache -func (apc *ApplicationProfileCacheImpl) deleteContainer(containerID string) { - apc.containerLocks.WithLock(containerID, func() { - // Get container info - containerInfo, exists := apc.containerIDToInfo.Load(containerID) - if !exists { - logger.L().Debug("containerID not found in cache", 
helpers.String("containerID", containerID)) - return - } - - // Clean up container info and call stack index - apc.containerIDToInfo.Delete(containerID) - apc.containerToCallStackIndex.Delete(containerID) - - // Check if any other container is using the same workload ID - workloadStillInUse := false - apc.containerIDToInfo.Range(func(_ string, info *ContainerInfo) bool { - if info.WorkloadID == containerInfo.WorkloadID { - workloadStillInUse = true - return false // Stop iteration - } - return true // Continue iteration - }) - - // If no other container is using the same workload ID, delete it from the cache - if !workloadStillInUse { - if profile, exists := apc.workloadIDToProfile.Load(containerInfo.WorkloadID); exists { - // Remove the profile from the cache - profileKey := apc.profileKey(profile.Namespace, profile.Name) - apc.profileToUserManagedIdentifier.Delete(profileKey) - } - apc.workloadIDToProfileState.Delete(containerInfo.WorkloadID) - apc.workloadIDToProfile.Delete(containerInfo.WorkloadID) - logger.L().Debug("deleted workloadID from cache", helpers.String("workloadID", containerInfo.WorkloadID)) - } - }) - - // Clean up the lock when done - call this outside the WithLock closure - apc.containerLocks.ReleaseLock(containerID) -} - -// waitForSharedContainerData waits for shared container data to be available -func (apc *ApplicationProfileCacheImpl) waitForSharedContainerData(containerID string, ctx context.Context) (*objectcache.WatchedContainerData, error) { - return backoff.Retry(ctx, func() (*objectcache.WatchedContainerData, error) { - if sharedData := apc.k8sObjectCache.GetSharedContainerData(containerID); sharedData != nil { - return sharedData, nil - } - return nil, fmt.Errorf("container %s not found in shared data", containerID) - }, backoff.WithBackOff(backoff.NewExponentialBackOff())) -} - -func (apc *ApplicationProfileCacheImpl) profileKey(namespace, name string) string { - return fmt.Sprintf("%s/%s", namespace, name) -} - -func (apc *ApplicationProfileCacheImpl) wlidKey(wlid, templateHash string) string { - return fmt.Sprintf("%s/%s", wlid, templateHash) -} - -func (apc *ApplicationProfileCacheImpl) performMerge(normalProfile, userManagedProfile *v1beta1.ApplicationProfile) *v1beta1.ApplicationProfile { - mergedProfile := normalProfile.DeepCopy() - - // Merge spec - mergedProfile.Spec.Containers = apc.mergeContainers(mergedProfile.Spec.Containers, userManagedProfile.Spec.Containers) - mergedProfile.Spec.InitContainers = apc.mergeContainers(mergedProfile.Spec.InitContainers, userManagedProfile.Spec.InitContainers) - mergedProfile.Spec.EphemeralContainers = apc.mergeContainers(mergedProfile.Spec.EphemeralContainers, userManagedProfile.Spec.EphemeralContainers) - - return mergedProfile -} - -func (apc *ApplicationProfileCacheImpl) mergeContainers(normalContainers, userManagedContainers []v1beta1.ApplicationProfileContainer) []v1beta1.ApplicationProfileContainer { - if len(userManagedContainers) != len(normalContainers) { - // If the number of containers don't match, we can't merge - logger.L().Warning("ApplicationProfileCacheImpl - failed to merge user-managed profile with base profile", - helpers.Int("normalContainers len", len(normalContainers)), - helpers.Int("userManagedContainers len", len(userManagedContainers)), - helpers.String("reason", "number of containers don't match")) - return normalContainers - } - - // Assuming the normalContainers are already in the correct Pod order - // We'll merge user containers at their corresponding positions - for i := range 
normalContainers { - for _, userContainer := range userManagedContainers { - if normalContainers[i].Name == userContainer.Name { - apc.mergeContainer(&normalContainers[i], &userContainer) - break - } - } - } - return normalContainers -} - -func (apc *ApplicationProfileCacheImpl) mergeContainer(normalContainer, userContainer *v1beta1.ApplicationProfileContainer) { - normalContainer.Capabilities = append(normalContainer.Capabilities, userContainer.Capabilities...) - normalContainer.Execs = append(normalContainer.Execs, userContainer.Execs...) - normalContainer.Opens = append(normalContainer.Opens, userContainer.Opens...) - normalContainer.Syscalls = append(normalContainer.Syscalls, userContainer.Syscalls...) - normalContainer.Endpoints = append(normalContainer.Endpoints, userContainer.Endpoints...) - for k, v := range userContainer.PolicyByRuleId { - if existingPolicy, exists := normalContainer.PolicyByRuleId[k]; exists { - normalContainer.PolicyByRuleId[k] = utils.MergePolicies(existingPolicy, v) - } else { - normalContainer.PolicyByRuleId[k] = v - } - } -} - -func isUserManagedProfile(appProfile *v1beta1.ApplicationProfile) bool { - return appProfile.Annotations != nil && - appProfile.Annotations[helpersv1.ManagedByMetadataKey] == helpersv1.ManagedByUserValue && - strings.HasPrefix(appProfile.GetName(), helpersv1.UserApplicationProfilePrefix) -} - -// GetApplicationProfile gets the application profile for a container -func (apc *ApplicationProfileCacheImpl) GetApplicationProfile(containerID string) *v1beta1.ApplicationProfile { - // Get container info - if containerInfo, exists := apc.containerIDToInfo.Load(containerID); exists { - workloadID := containerInfo.WorkloadID - if workloadID == "" { - return nil - } - - // Try to get profile from cache - if profile, exists := apc.workloadIDToProfile.Load(workloadID); exists { - if profile != nil { - return profile - } - } - } - - return nil -} - -// GetApplicationProfileState gets the profile state for a container -func (apc *ApplicationProfileCacheImpl) GetApplicationProfileState(containerID string) *objectcache.ProfileState { - // Get container info - containerInfo, exists := apc.containerIDToInfo.Load(containerID) - if !exists { - return &objectcache.ProfileState{ - Error: fmt.Errorf("container %s not found in cache", containerID), - } - } - - workloadID := containerInfo.WorkloadID - if workloadID == "" { - return &objectcache.ProfileState{ - Error: fmt.Errorf("no workload ID for container %s", containerID), - } - } - - // Try to get profile state from cache - if profileState, exists := apc.workloadIDToProfileState.Load(workloadID); exists { - if profileState != nil { - return profileState - } - return &objectcache.ProfileState{ - Error: fmt.Errorf("application profile state is nil for workload %s", workloadID), - } - } - - return &objectcache.ProfileState{ - Error: fmt.Errorf("profile state not found for workload ID %s", workloadID), - } -} - -// GetCallStackSearchTree gets the call stack index for a container -func (apc *ApplicationProfileCacheImpl) GetCallStackSearchTree(containerID string) *callstackcache.CallStackSearchTree { - if index, exist := apc.containerToCallStackIndex.Load(containerID); exist { - return index.searchTree - } - - return nil -} - -// getNamespaces retrieves all unique namespaces from the container info cache -func (apc *ApplicationProfileCacheImpl) getNamespaces() []string { - namespaceSet := mapset.NewSet[string]() - apc.containerIDToInfo.Range(func(_ string, info *ContainerInfo) bool { - 
namespaceSet.Add(info.Namespace) - return true - }) - return namespaceSet.ToSlice() -} - -// getContainerIDsForNamespace retrieves all container IDs for a given namespace -func (apc *ApplicationProfileCacheImpl) getContainerIDsForNamespace(namespace string) []string { - containerIDs := []string{} - apc.containerIDToInfo.Range(func(containerID string, info *ContainerInfo) bool { - if info.Namespace == namespace { - containerIDs = append(containerIDs, containerID) - } - return true - }) - return containerIDs -} - -// Ensure ApplicationProfileCacheImpl implements the ApplicationProfileCache interface -var _ objectcache.ApplicationProfileCache = (*ApplicationProfileCacheImpl)(nil) diff --git a/pkg/objectcache/applicationprofilecache/applicationprofilecache_test.go b/pkg/objectcache/applicationprofilecache/applicationprofilecache_test.go deleted file mode 100644 index 6a89edcb24..0000000000 --- a/pkg/objectcache/applicationprofilecache/applicationprofilecache_test.go +++ /dev/null @@ -1,103 +0,0 @@ -package applicationprofilecache - -import ( - "context" - "fmt" - "testing" - - "github.com/kubescape/node-agent/pkg/config" - "github.com/kubescape/node-agent/pkg/storage" - "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// SpyProfileClient for testing pagination -type SpyProfileClient struct { - storage.ProfileClient - Profiles []v1beta1.ApplicationProfile - CallCount int -} - -func (m *SpyProfileClient) ListApplicationProfiles(namespace string, limit int64, cont string) (*v1beta1.ApplicationProfileList, error) { - m.CallCount++ - start := 0 - if cont != "" { - fmt.Sscanf(cont, "%d", &start) - } - - end := start + int(limit) - nextCont := "" - if end < len(m.Profiles) { - nextCont = fmt.Sprintf("%d", end) - } else { - end = len(m.Profiles) - } - - return &v1beta1.ApplicationProfileList{ - ListMeta: metav1.ListMeta{ - Continue: nextCont, - }, - Items: m.Profiles[start:end], - }, nil -} - -func (m *SpyProfileClient) GetApplicationProfile(namespace, name string) (*v1beta1.ApplicationProfile, error) { - // Return empty profile to avoid errors in update loop - return &v1beta1.ApplicationProfile{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Annotations: map[string]string{ - "kubescape.io/completion": "complete", - "kubescape.io/status": "completed", - }, - }, - }, nil -} - -func TestPagination(t *testing.T) { - totalProfiles := 120 - profiles := make([]v1beta1.ApplicationProfile, totalProfiles) - for i := 0; i < totalProfiles; i++ { - profiles[i] = v1beta1.ApplicationProfile{ - ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("profile-%d", i), - Namespace: "default", - Annotations: map[string]string{ - "kubescape.io/completion": "complete", - "kubescape.io/status": "completed", - }, - Labels: map[string]string{ - "kubescape.io/wlid-template-hash": "hash", - }, - }, - } - } - - spy := &SpyProfileClient{Profiles: profiles} - - // mock k8s object cache is irrelevant since we inject container info directly - cache := NewApplicationProfileCache(config.Config{}, spy, nil, nil) - - // Inject a container so that "default" namespace is processed. - // The WorkloadID needs to match something if we want deeper logic to run, - // but for pagination of ListApplicationProfiles, we just need to get past `getContainerIDsForNamespace` check. - // AND we need to simulate at least one container to trigger the list call. 
- cache.containerIDToInfo.Set("test-container", &ContainerInfo{ - Namespace: "default", - WorkloadID: "wlid", - }) - - // Call the private method - cache.updateAllProfiles(context.Background()) - - // We expect 3 calls: - // 1. 0-50, returns continue="50" - // 2. 50-100, returns continue="100" - // 3. 100-120, returns continue="" - // (Implementation loop checks continueToken == "") - - if spy.CallCount != 3 { - t.Errorf("Expected 3 calls to ListApplicationProfiles, got %d", spy.CallCount) - } -} diff --git a/pkg/objectcache/applicationprofilecache_interface.go b/pkg/objectcache/applicationprofilecache_interface.go deleted file mode 100644 index 780efa23b4..0000000000 --- a/pkg/objectcache/applicationprofilecache_interface.go +++ /dev/null @@ -1,34 +0,0 @@ -package objectcache - -import ( - containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" - "github.com/kubescape/node-agent/pkg/objectcache/applicationprofilecache/callstackcache" - "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" -) - -type ApplicationProfileCache interface { - GetApplicationProfile(containerID string) *v1beta1.ApplicationProfile - GetApplicationProfileState(containerID string) *ProfileState - GetCallStackSearchTree(containerID string) *callstackcache.CallStackSearchTree - ContainerCallback(notif containercollection.PubSubEvent) -} - -var _ ApplicationProfileCache = (*ApplicationProfileCacheMock)(nil) - -type ApplicationProfileCacheMock struct { -} - -func (ap *ApplicationProfileCacheMock) GetApplicationProfile(_ string) *v1beta1.ApplicationProfile { - return nil -} - -func (ap *ApplicationProfileCacheMock) GetCallStackSearchTree(_ string) *callstackcache.CallStackSearchTree { - return nil -} - -func (ap *ApplicationProfileCacheMock) ContainerCallback(_ containercollection.PubSubEvent) { -} - -func (ap *ApplicationProfileCacheMock) GetApplicationProfileState(_ string) *ProfileState { - return nil -} diff --git a/pkg/objectcache/applicationprofilecache/callstackcache/callstackcache.go b/pkg/objectcache/callstackcache/callstackcache.go similarity index 100% rename from pkg/objectcache/applicationprofilecache/callstackcache/callstackcache.go rename to pkg/objectcache/callstackcache/callstackcache.go diff --git a/pkg/objectcache/applicationprofilecache/callstackcache/callstackcache_test.go b/pkg/objectcache/callstackcache/callstackcache_test.go similarity index 100% rename from pkg/objectcache/applicationprofilecache/callstackcache/callstackcache_test.go rename to pkg/objectcache/callstackcache/callstackcache_test.go diff --git a/pkg/objectcache/containerprofilecache/containerprofilecache.go b/pkg/objectcache/containerprofilecache/containerprofilecache.go new file mode 100644 index 0000000000..8185957a27 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/containerprofilecache.go @@ -0,0 +1,617 @@ +// Package containerprofilecache provides a unified, container-keyed cache for ContainerProfile objects. 
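+//
+// Typical wiring, as a sketch of the call sites in cmd/main.go (variable
+// names abbreviated):
+//
+//	cpc := containerprofilecache.NewContainerProfileCache(cfg, storageClient, k8sObjectCache, metricsManager)
+//	cpc.Start(ctx)
+//	objCache := objectcachev1.NewObjectCache(k8sObjectCache, cpc, dnsCache)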
+package containerprofilecache + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + "time" + + "github.com/cenkalti/backoff/v5" + "github.com/goradd/maps" + containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + "github.com/kubescape/go-logger" + "github.com/kubescape/go-logger/helpers" + helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" + "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/metricsmanager" + "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/objectcache/callstackcache" + "github.com/kubescape/node-agent/pkg/resourcelocks" + "github.com/kubescape/node-agent/pkg/storage" + "github.com/kubescape/node-agent/pkg/utils" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// defaultReconcileInterval is the fallback refresh cadence when +// config.ProfilesCacheRefreshRate is zero. +// defaultStorageRPCBudget is the per-call timeout applied by refreshRPC when +// config.StorageRPCBudget is zero. +const ( + defaultReconcileInterval = 30 * time.Second + defaultStorageRPCBudget = 5 * time.Second +) + +// namespacedName is a minimal identifier for a legacy user-authored CRD +// (ApplicationProfile / NetworkNeighborhood) overlaid on a ContainerProfile. +type namespacedName struct { + Namespace string + Name string +} + +// CachedContainerProfile is the per-container cache entry. One entry per live +// containerID, populated on ContainerCallback (Add) and removed on Remove. +// +// Profile may be the raw storage-fetched pointer (Shared=true, fast path) or +// a DeepCopy with user-authored AP/NN overlays merged in (Shared=false). +// entry.Profile is read-only once stored; storage.ProfileClient returns +// fresh-decoded objects per call (thin wrapper over client-go typed client) +// so shared aliasing is safe. +type CachedContainerProfile struct { + Profile *v1beta1.ContainerProfile + State *objectcache.ProfileState + CallStackTree *callstackcache.CallStackSearchTree + + ContainerName string + PodName string + Namespace string + PodUID string + WorkloadID string + + // UserAPRef / UserNNRef are set when the entry was built with a legacy + // user-authored AP/NN overlay. Used by the reconciler to re-fetch on + // refresh and to key deprecation warnings. + UserAPRef *namespacedName + UserNNRef *namespacedName + + // CPName is the storage name of the ContainerProfile. Populated at + // addContainer time so the reconciler can re-fetch without re-querying + // shared data (which may have been evicted from K8sObjectCache by then). + CPName string + + // WorkloadName is the per-workload slug used to fetch the workload-level + // ApplicationProfile / NetworkNeighborhood (primary data source while the + // storage-side consolidated CP isn't publicly queryable) and, with the + // "ug-" prefix, the user-managed AP/NN. Populated at addContainer time. 
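+	// As an illustration only (the slug encoding is owned by k8s-interface
+	// and assumed here): a workload slug like "deployment-nginx" would be
+	// fetched as-is for the workload-level AP/NN and as
+	// "ug-deployment-nginx" for the user-managed variants.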
+ WorkloadName string + + Shared bool // true iff Profile is the shared storage-fetched pointer (read-only) + RV string // ContainerProfile resourceVersion at last load + UserManagedAPRV string // user-managed AP (ug-) RV at last projection, "" if absent + UserManagedNNRV string // user-managed NN (ug-) RV at last projection, "" if absent + UserAPRV string // user-AP (label-referenced) resourceVersion at last projection, "" if no overlay + UserNNRV string // user-NN (label-referenced) resourceVersion at last projection, "" if no overlay +} + +// pendingContainer captures the minimum state needed to retry the initial +// ContainerProfile GET when the CP is not yet in storage at addContainer time. +// The reconciler iterates pending each tick, re-issues the GET, and promotes +// the entry to `entries` on success. Component-tests regression (PR #788) +// showed the legacy periodic-scan path was load-bearing; this is its +// equivalent in the point-lookup model. +type pendingContainer struct { + container *containercollection.Container + sharedData *objectcache.WatchedContainerData + cpName string + workloadName string +} + +// ContainerProfileCacheImpl is the unified container-keyed cache for ContainerProfile objects. +type ContainerProfileCacheImpl struct { + cfg config.Config + entries maps.SafeMap[string, *CachedContainerProfile] + pending maps.SafeMap[string, *pendingContainer] + containerLocks *resourcelocks.ResourceLocks + storageClient storage.ProfileClient + k8sObjectCache objectcache.K8sObjectCache + metricsManager metricsmanager.MetricsManager + + reconcileEvery time.Duration + rpcBudget time.Duration + refreshInProgress atomic.Bool + + // deprecationDedup tracks (kind|ns/name@rv) keys to emit one WARN log + // per legacy CRD resource-version across the process lifetime. + deprecationDedup sync.Map +} + +// NewContainerProfileCache creates a new ContainerProfileCacheImpl. +// metricsManager may be nil; internally we substitute a no-op so call sites +// don't need nil checks. +func NewContainerProfileCache(cfg config.Config, storageClient storage.ProfileClient, k8sObjectCache objectcache.K8sObjectCache, metricsManager metricsmanager.MetricsManager) *ContainerProfileCacheImpl { + reconcileEvery := utils.AddJitter(cfg.ProfilesCacheRefreshRate, 10) + if cfg.ProfilesCacheRefreshRate <= 0 { + reconcileEvery = defaultReconcileInterval + } + if metricsManager == nil { + metricsManager = metricsmanager.NewMetricsNoop() + } + rpcBudget := cfg.StorageRPCBudget + if rpcBudget <= 0 { + rpcBudget = defaultStorageRPCBudget + } + return &ContainerProfileCacheImpl{ + cfg: cfg, + containerLocks: resourcelocks.New(), + storageClient: storageClient, + k8sObjectCache: k8sObjectCache, + metricsManager: metricsManager, + reconcileEvery: reconcileEvery, + rpcBudget: rpcBudget, + } +} + +// refreshRPC calls fn with a context bounded by c.rpcBudget, enforcing a +// per-call SLO so a slow API server cannot stall a full reconciler burst. +func (c *ContainerProfileCacheImpl) refreshRPC(ctx context.Context, fn func(context.Context) error) error { + rpcCtx, cancel := context.WithTimeout(ctx, c.rpcBudget) + defer cancel() + return fn(rpcCtx) +} + +// Start begins the periodic reconciler goroutine. The loop evicts entries +// whose container is no longer Running and refreshes live entries' base CP + +// user AP/NN overlays. See reconciler.go for the tick loop and RPC-cost +// characterization. 
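+//
+// A minimal wiring sketch (call sites assumed; only names defined in this
+// package are used):
+//
+//	cpc := NewContainerProfileCache(cfg, storageClient, k8sObjectCache, nil)
+//	cpc.Start(ctx)
+//	// subscribe cpc.ContainerCallback to the container collection, then:
+//	profile := cpc.GetContainerProfile(containerID) // nil until an entry lands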
+func (c *ContainerProfileCacheImpl) Start(ctx context.Context) { + go c.tickLoop(ctx) +} + +// ContainerCallback handles container lifecycle events (add/remove). Mirrors +// the shape used by the legacy caches. +func (c *ContainerProfileCacheImpl) ContainerCallback(notif containercollection.PubSubEvent) { + isHost := utils.IsHostContainer(notif.Container) + namespace := notif.Container.K8s.Namespace + if isHost { + namespace = "host" + } + switch notif.Type { + case containercollection.EventTypeAddContainer: + if !isHost && c.cfg.IgnoreContainer(namespace, notif.Container.K8s.PodName, notif.Container.K8s.PodLabels) { + return + } + container := notif.Container + if isHost { + containerCopy := *notif.Container + containerCopy.K8s.Namespace = namespace + container = &containerCopy + } + go c.addContainerWithTimeout(container) + case containercollection.EventTypeRemoveContainer: + // Skip the ignore check on Remove: a container added before its pod + // labels matched the ignore filter would otherwise leak in the cache. + // The reconciler eviction path is the safety net, but a Remove event + // should always clean up regardless of current label state. + go c.deleteContainer(notif.Container.Runtime.ContainerID) + } +} + +// addContainerWithTimeout runs addContainer with a 10-minute cap to prevent +// a stuck storage client from wedging the callback goroutine. +func (c *ContainerProfileCacheImpl) addContainerWithTimeout(container *containercollection.Container) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + defer cancel() + + done := make(chan error, 1) + go func() { + done <- c.addContainer(container, ctx) + }() + + select { + case err := <-done: + if err != nil { + logger.L().Error("failed to add container to the container-profile cache", helpers.Error(err)) + } + case <-ctx.Done(): + logger.L().Error("timeout while adding container to the container-profile cache", + helpers.String("containerID", container.Runtime.ContainerID), + helpers.String("containerName", container.Runtime.ContainerName), + helpers.String("podName", container.K8s.PodName), + helpers.String("namespace", container.K8s.Namespace)) + } +} + +// addContainer builds and stores a cache entry for the container: fetches +// the ContainerProfile from storage, optionally fetches user-authored AP/NN +// CRDs, projects them onto a DeepCopy (or fast-paths via shared pointer), and +// builds the call-stack search tree. +func (c *ContainerProfileCacheImpl) addContainer(container *containercollection.Container, ctx context.Context) error { + containerID := container.Runtime.ContainerID + + return c.containerLocks.WithLockAndError(containerID, func() error { + sharedData, err := c.waitForSharedContainerData(containerID, ctx) + if err != nil { + logger.L().Error("failed to get shared data for container", + helpers.String("containerID", containerID), + helpers.Error(err)) + return err + } + + // Names we need: + // cpName = per-container stable slug, for the consolidated CP. + // Kept for forward-compat; current storage does not + // publish a queryable consolidated CP at this name, + // so we treat a 404 as "not yet". + // workloadName = per-workload stable slug, where the server-side + // aggregation publishes the ApplicationProfile and + // NetworkNeighborhood CRs. Legacy caches read these + // directly; the new cache does the same while the + // server-side consolidated-CP plumbing matures. 
+	cpName, err := sharedData.InstanceID.GetSlug(false)
+	if err != nil {
+		logger.L().Error("failed to compute container profile slug",
+			helpers.String("containerID", containerID),
+			helpers.Error(err))
+		return err
+	}
+	workloadName, err := sharedData.InstanceID.GetSlug(true)
+	if err != nil {
+		logger.L().Error("failed to compute workload profile slug",
+			helpers.String("containerID", containerID),
+			helpers.Error(err))
+		return err
+	}
+
+	if populated := c.tryPopulateEntry(ctx, containerID, container, sharedData, cpName, workloadName); !populated {
+		// No profile data available yet (neither consolidated CP nor
+		// workload AP/NN have landed in storage). Record a pending entry;
+		// the reconciler will retry each tick until data shows up or the
+		// container stops. This preserves the legacy periodic-scan
+		// recovery that kicked in when profiles were created after
+		// container-start.
+		c.pending.Set(containerID, &pendingContainer{
+			container:    container,
+			sharedData:   sharedData,
+			cpName:       cpName,
+			workloadName: workloadName,
+		})
+		c.metricsManager.SetContainerProfileCacheEntries("pending", float64(c.pending.Len()))
+	}
+	return nil
+	})
+}
+
+// tryPopulateEntry issues the CP GET (plus any user-AP/NN overlay) and
+// installs the cache entry on success. Returns true iff an entry was
+// installed. Must be called with the per-container lock held, i.e. from
+// inside containerLocks.WithLock / WithLockAndError for this containerID.
+func (c *ContainerProfileCacheImpl) tryPopulateEntry(
+	ctx context.Context,
+	containerID string,
+	container *containercollection.Container,
+	sharedData *objectcache.WatchedContainerData,
+	cpName, workloadName string,
+) bool {
+	ns := container.K8s.Namespace
+
+	// Fetch consolidated ContainerProfile. The storage server aggregates the
+	// per-tick time-series CPs (written by containerprofilemanager at names
+	// ending in a random UUID suffix) into a consolidated CP at the stable
+	// name returned by GetSlug(false). Until that aggregation runs the Get
+	// returns 404 — we record pending and the reconciler retries on each
+	// tick.
+	var (
+		cp    *v1beta1.ContainerProfile
+		cpErr error
+	)
+	_ = c.refreshRPC(ctx, func(rctx context.Context) error {
+		cp, cpErr = c.storageClient.GetContainerProfile(rctx, ns, cpName)
+		return cpErr
+	})
+	if cpErr != nil {
+		logger.L().Debug("ContainerProfile not yet available",
+			helpers.String("containerID", containerID),
+			helpers.String("namespace", ns),
+			helpers.String("name", cpName),
+			helpers.Error(cpErr))
+		cp = nil
+	}
+
+	// Fetch the user-managed AP / NN published at the well-known
+	// "ug-"+workloadName name. Legacy caches auto-detected these via the
+	// `kubescape.io/managed-by: User` annotation and merged them on top of
+	// the base profile; we read them directly by their well-known name
+	// instead, avoiding a List and an annotation filter. Both are
+	// optional: nil on 404.
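+	// Overlay precedence, lowest to highest: base CP (or the synthetic CP
+	// built below when none exists), then the user-managed "ug-" AP/NN
+	// projection, then the label-referenced user-defined AP/NN merged in
+	// buildEntry. Each projection pass works on a DeepCopy; the
+	// storage-fetched base is never mutated.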
+ var userManagedAP *v1beta1.ApplicationProfile + var userManagedNN *v1beta1.NetworkNeighborhood + if workloadName != "" { + ugName := helpersv1.UserApplicationProfilePrefix + workloadName + var ugAPErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userManagedAP, ugAPErr = c.storageClient.GetApplicationProfile(rctx, ns, ugName) + return ugAPErr + }) + if ugAPErr != nil { + logger.L().Debug("user-managed ApplicationProfile not available", + helpers.String("containerID", containerID), + helpers.String("namespace", ns), + helpers.String("name", ugName), + helpers.Error(ugAPErr)) + userManagedAP = nil + } + ugNNName := helpersv1.UserNetworkNeighborhoodPrefix + workloadName + var ugNNErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userManagedNN, ugNNErr = c.storageClient.GetNetworkNeighborhood(rctx, ns, ugNNName) + return ugNNErr + }) + if ugNNErr != nil { + logger.L().Debug("user-managed NetworkNeighborhood not available", + helpers.String("containerID", containerID), + helpers.String("namespace", ns), + helpers.String("name", ugNNName), + helpers.Error(ugNNErr)) + userManagedNN = nil + } + } + + // Fix (reviewer #3): if the consolidated CP is still Partial and this + // container is not PreRunning (i.e. we saw it start fresh after the + // agent was already up), the partial view belongs to a PREVIOUS container + // incarnation. Legacy caches explicitly deleted such partials on restart + // so rule evaluation fell through to "no profile" until a new Full + // profile arrived. Mirror that: keep pending, retry each tick. + if !sharedData.PreRunningContainer { + if cp != nil && cp.Annotations[helpersv1.CompletionMetadataKey] == helpersv1.Partial { + cp = nil + } + } + + // Fetch user-authored legacy CRDs when the pod carries the + // UserDefinedProfileMetadataKey label. Fix (reviewer #2): fetch + // independently of the base-CP result, so a container that only has a + // user-defined profile still gets a cache entry. Recording the refs is + // gated on successful fetch here (otherwise the projection has no data + // to merge); the reconciler's refresh path re-fetches on each tick so + // transient failures are recovered. + var userAP *v1beta1.ApplicationProfile + var userNN *v1beta1.NetworkNeighborhood + overlayName, hasOverlay := container.K8s.PodLabels[helpersv1.UserDefinedProfileMetadataKey] + if hasOverlay && overlayName != "" { + var userAPErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userAP, userAPErr = c.storageClient.GetApplicationProfile(rctx, ns, overlayName) + return userAPErr + }) + if userAPErr != nil { + logger.L().Debug("user-defined ApplicationProfile not available", + helpers.String("containerID", containerID), + helpers.String("namespace", ns), + helpers.String("name", overlayName), + helpers.Error(userAPErr)) + userAP = nil + } + var userNNErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userNN, userNNErr = c.storageClient.GetNetworkNeighborhood(rctx, ns, overlayName) + return userNNErr + }) + if userNNErr != nil { + logger.L().Debug("user-defined NetworkNeighborhood not available", + helpers.String("containerID", containerID), + helpers.String("namespace", ns), + helpers.String("name", overlayName), + helpers.Error(userNNErr)) + userNN = nil + } + } + + // Need SOMETHING to cache. If we have nothing, stay pending and retry. 
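+	// ("Something" means any of the sources fetched above: the consolidated
+	// CP, the user-managed "ug-" AP/NN, or the user-defined overlay AP/NN.)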
+ if cp == nil && userManagedAP == nil && userManagedNN == nil && userAP == nil && userNN == nil { + return false + } + + // When no consolidated CP is available, synthesize an empty CP named + // after the workload so downstream state display is sensible. Projection + // below merges user-managed + user-defined overlay onto this base. + if cp == nil { + syntheticName := workloadName + if syntheticName == "" { + syntheticName = overlayName + } + cp = &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: syntheticName, + Namespace: ns, + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Full, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + } + } + + pod := c.k8sObjectCache.GetPod(container.K8s.Namespace, container.K8s.PodName) + if pod == nil { + logger.L().Debug("pod not found in k8s cache; skipping pod-aware merge checks", + helpers.String("containerID", containerID), + helpers.String("namespace", container.K8s.Namespace), + helpers.String("podName", container.K8s.PodName)) + } + + // User-managed projection pass (published at the + // "ug-" well-known name). Legacy caches auto-merged these + // in handleUserManagedProfile after detecting the managed-by annotation; + // here we always union in whatever's published at the convention name. + // This is what Test_12_MergingProfilesTest / Test_13_MergingNetworkNeighborhoodTest + // exercise: rules must alert on events absent from the merged base+user-managed + // profile. + userManagedApplied := userManagedAP != nil || userManagedNN != nil + if userManagedApplied { + projected, warnings := projectUserProfiles(cp, userManagedAP, userManagedNN, pod, container.Runtime.ContainerName) + cp = projected + c.emitOverlayMetrics(userManagedAP, userManagedNN, warnings) + } + + entry := c.buildEntry(cp, userAP, userNN, pod, container, sharedData, userManagedApplied) + // Override CPName with the real consolidated-CP slug. buildEntry sets + // CPName from cp.Name, but when cp was synthesized above (no consolidated + // CP in storage yet), cp.Name is the workloadName/overlayName — NOT the + // GetSlug(false) name refreshOneEntry must GET. Without this override, + // refresh queries the synthetic name, always 404s, and the fast-skip + // keeps the synthetic entry forever (stored RV is "" == absent-match). + entry.CPName = cpName + // Fill in user-managed bookkeeping so refreshOneEntry can re-fetch these + // sources on every tick. WorkloadName is the "ug-" lookup prefix. + entry.WorkloadName = workloadName + if userManagedAP != nil { + entry.UserManagedAPRV = userManagedAP.ResourceVersion + } + if userManagedNN != nil { + entry.UserManagedNNRV = userManagedNN.ResourceVersion + } + + // Fix (reviewer #2): when the overlay label is set, record UserAPRef / + // UserNNRef even if the initial fetch failed. The refresh loop uses + // these refs to re-fetch on every tick; without them, a transient 404 + // at add time would permanently lose the overlay. 
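+	// Illustrative recovery timeline (timings assumed): an Add at t0 that
+	// 404s on the overlay still records the refs below; the reconciler tick
+	// at roughly t0+reconcileEvery re-fetches via those refs and rebuilds
+	// the projected entry once the CRD appears.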
+ if hasOverlay && overlayName != "" { + if entry.UserAPRef == nil { + entry.UserAPRef = &namespacedName{Namespace: ns, Name: overlayName} + } + if entry.UserNNRef == nil { + entry.UserNNRef = &namespacedName{Namespace: ns, Name: overlayName} + } + } + + c.entries.Set(containerID, entry) + c.pending.Delete(containerID) + c.metricsManager.SetContainerProfileCacheEntries("container", float64(c.entries.Len())) + c.metricsManager.SetContainerProfileCacheEntries("pending", float64(c.pending.Len())) + + logger.L().Debug("ContainerProfileCache - container added", + helpers.String("containerID", containerID), + helpers.String("namespace", container.K8s.Namespace), + helpers.String("podName", container.K8s.PodName), + helpers.String("cpName", cpName), + helpers.String("shared", fmt.Sprintf("%v", entry.Shared))) + return true +} + +// buildEntry constructs a CachedContainerProfile, choosing the fast-path +// (shared pointer, no user overlay) or projection path (DeepCopy + merge). +func (c *ContainerProfileCacheImpl) buildEntry( + cp *v1beta1.ContainerProfile, + userAP *v1beta1.ApplicationProfile, + userNN *v1beta1.NetworkNeighborhood, + pod *corev1.Pod, + container *containercollection.Container, + sharedData *objectcache.WatchedContainerData, + userManagedApplied bool, +) *CachedContainerProfile { + entry := &CachedContainerProfile{ + ContainerName: container.Runtime.ContainerName, + PodName: container.K8s.PodName, + Namespace: container.K8s.Namespace, + WorkloadID: sharedData.Wlid + "/" + sharedData.InstanceID.GetTemplateHash(), + CPName: cp.Name, + RV: cp.ResourceVersion, + } + if pod != nil { + entry.PodUID = string(pod.UID) + } + + if userAP == nil && userNN == nil && !userManagedApplied { + // Fast path: share the storage-fetched pointer. Profile is the raw + // storage object — callers must not mutate it. + entry.Profile = cp + entry.Shared = true + } else { + projected, warnings := projectUserProfiles(cp, userAP, userNN, pod, container.Runtime.ContainerName) + entry.Profile = projected + entry.Shared = false + + if userAP != nil { + entry.UserAPRef = &namespacedName{Namespace: userAP.Namespace, Name: userAP.Name} + entry.UserAPRV = userAP.ResourceVersion + } + if userNN != nil { + entry.UserNNRef = &namespacedName{Namespace: userNN.Namespace, Name: userNN.Name} + entry.UserNNRV = userNN.ResourceVersion + } + + c.emitOverlayMetrics(userAP, userNN, warnings) + } + + // Build call-stack search tree from entry.Profile.Spec.IdentifiedCallStacks. + // Shared path: do not mutate the storage-fetched pointer; call stacks + // stay in the profile but are never read through Profile (only through + // CallStackTree). + tree := callstackcache.NewCallStackSearchTree() + for _, stack := range entry.Profile.Spec.IdentifiedCallStacks { + tree.AddCallStack(stack) + } + entry.CallStackTree = tree + + // ProfileState from CP annotations (Completion/Status) + Name. + entry.State = &objectcache.ProfileState{ + Completion: cp.Annotations[helpersv1.CompletionMetadataKey], + Status: cp.Annotations[helpersv1.StatusMetadataKey], + Name: cp.Name, + } + + return entry +} + +// deleteContainer removes a container entry. The per-container lock entry is +// intentionally NOT released: Phase-4 review flagged a race where a concurrent +// addContainer can hold a reference to the old mutex while a subsequent +// GetLock creates a new one, breaking mutual exclusion. 
Memory cost is bounded +// by the node's container-ID churn (live containers + recently-deleted), so +// keeping stale lock entries is cheaper than getting the atomic-release right. +func (c *ContainerProfileCacheImpl) deleteContainer(id string) { + c.containerLocks.WithLock(id, func() { + c.entries.Delete(id) + c.pending.Delete(id) + }) + c.metricsManager.SetContainerProfileCacheEntries("container", float64(c.entries.Len())) + c.metricsManager.SetContainerProfileCacheEntries("pending", float64(c.pending.Len())) +} + +// GetContainerProfile returns the cached ContainerProfile pointer for a +// container, or nil if there is no entry. Reports a cache-hit metric. +func (c *ContainerProfileCacheImpl) GetContainerProfile(containerID string) *v1beta1.ContainerProfile { + if entry, ok := c.entries.Load(containerID); ok && entry != nil && entry.Profile != nil { + c.metricsManager.ReportContainerProfileCacheHit(true) + return entry.Profile + } + c.metricsManager.ReportContainerProfileCacheHit(false) + return nil +} + +// GetContainerProfileState returns the cached ProfileState for a container +// (completion/status/name). Returns a synthetic error state when the entry +// is missing. +func (c *ContainerProfileCacheImpl) GetContainerProfileState(containerID string) *objectcache.ProfileState { + if entry, ok := c.entries.Load(containerID); ok && entry != nil && entry.State != nil { + return entry.State + } + return &objectcache.ProfileState{ + Error: fmt.Errorf("container %s not found in container-profile cache", containerID), + } +} + +// GetCallStackSearchTree returns the cached call-stack index for a container, +// or nil if there is no entry or no tree. +func (c *ContainerProfileCacheImpl) GetCallStackSearchTree(containerID string) *callstackcache.CallStackSearchTree { + if entry, ok := c.entries.Load(containerID); ok && entry != nil { + return entry.CallStackTree + } + return nil +} + +// waitForSharedContainerData blocks until K8sObjectCache has shared data for +// the container (populated by containerwatcher) or ctx expires. +func (c *ContainerProfileCacheImpl) waitForSharedContainerData(containerID string, ctx context.Context) (*objectcache.WatchedContainerData, error) { + return backoff.Retry(ctx, func() (*objectcache.WatchedContainerData, error) { + if sharedData := c.k8sObjectCache.GetSharedContainerData(containerID); sharedData != nil { + return sharedData, nil + } + return nil, fmt.Errorf("container %s not found in shared data", containerID) + }, backoff.WithBackOff(backoff.NewExponentialBackOff())) +} + +// Ensure ContainerProfileCacheImpl implements the ContainerProfileCache interface. 
+var _ objectcache.ContainerProfileCache = (*ContainerProfileCacheImpl)(nil) diff --git a/pkg/objectcache/containerprofilecache/containerprofilecache_test.go b/pkg/objectcache/containerprofilecache/containerprofilecache_test.go new file mode 100644 index 0000000000..1cf039391d --- /dev/null +++ b/pkg/objectcache/containerprofilecache/containerprofilecache_test.go @@ -0,0 +1,331 @@ +package containerprofilecache + +import ( + "context" + "errors" + "testing" + "time" + + containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + eventtypes "github.com/inspektor-gadget/inspektor-gadget/pkg/types" + instanceidhandlerV1 "github.com/kubescape/k8s-interface/instanceidhandler/v1" + helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" + "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/storage" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// fakeProfileClient is a minimal storage.ProfileClient stub for tests. It +// always returns the same CP pointer (so the fast-path can be asserted via +// pointer equality). +type fakeProfileClient struct { + cp *v1beta1.ContainerProfile + ap *v1beta1.ApplicationProfile // returned for Get by ap.Name match (or any if overlayOnly is empty) + nn *v1beta1.NetworkNeighborhood + cpErr error + apErr error + nnErr error + + // userManagedAP / userManagedNN, when non-nil, are returned for any + // GetApplicationProfile / GetNetworkNeighborhood whose name starts with + // the "ug-" prefix (the convention used by legacy user-managed profiles). + // This lets tests exercise the user-managed merge path added for + // Test_12_MergingProfilesTest / Test_13_MergingNetworkNeighborhoodTest + // without fighting the overlayOnly restriction. + userManagedAP *v1beta1.ApplicationProfile + userManagedNN *v1beta1.NetworkNeighborhood + + // overlayOnly, if non-empty, restricts ap/nn returns to only the given + // name; other names return (nil, nil). Tests that mix workload-AP/NN + // with overlay-AP/NN use this to keep the fixture scoped. 
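+	// For example, overlayOnly="override" routes names carrying the "ug-"
+	// prefix to userManagedAP/userManagedNN, "override" itself to ap/nn,
+	// and every other name to (nil, nil).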
+	overlayOnly string
+
+	getCPCalls int
+}
+
+var _ storage.ProfileClient = (*fakeProfileClient)(nil)
+
+func (f *fakeProfileClient) GetApplicationProfile(_ context.Context, _, name string) (*v1beta1.ApplicationProfile, error) {
+	// Prefix match without hard-coding the prefix length.
+	if pfx := helpersv1.UserApplicationProfilePrefix; len(name) >= len(pfx) && name[:len(pfx)] == pfx {
+		return f.userManagedAP, nil
+	}
+	if f.overlayOnly != "" && name != f.overlayOnly {
+		return nil, nil
+	}
+	return f.ap, f.apErr
+}
+func (f *fakeProfileClient) GetNetworkNeighborhood(_ context.Context, _, name string) (*v1beta1.NetworkNeighborhood, error) {
+	if pfx := helpersv1.UserNetworkNeighborhoodPrefix; len(name) >= len(pfx) && name[:len(pfx)] == pfx {
+		return f.userManagedNN, nil
+	}
+	if f.overlayOnly != "" && name != f.overlayOnly {
+		return nil, nil
+	}
+	return f.nn, f.nnErr
+}
+func (f *fakeProfileClient) GetContainerProfile(_ context.Context, _, _ string) (*v1beta1.ContainerProfile, error) {
+	f.getCPCalls++
+	return f.cp, f.cpErr
+}
+func (f *fakeProfileClient) ListApplicationProfiles(_ context.Context, _ string, _ int64, _ string) (*v1beta1.ApplicationProfileList, error) {
+	return &v1beta1.ApplicationProfileList{}, nil
+}
+func (f *fakeProfileClient) ListNetworkNeighborhoods(_ context.Context, _ string, _ int64, _ string) (*v1beta1.NetworkNeighborhoodList, error) {
+	return &v1beta1.NetworkNeighborhoodList{}, nil
+}
+
+// newTestCache returns a cache wired with an in-memory K8sObjectCacheMock.
+func newTestCache(t *testing.T, client storage.ProfileClient) (*ContainerProfileCacheImpl, *objectcache.K8sObjectCacheMock) {
+	t.Helper()
+	k8s := &objectcache.K8sObjectCacheMock{}
+	cfg := config.Config{ProfilesCacheRefreshRate: 30 * time.Second}
+	return NewContainerProfileCache(cfg, client, k8s, nil), k8s
+}
+
+// primeSharedData stashes a WatchedContainerData so waitForSharedContainerData
+// resolves instantly. It builds a real InstanceID from a pod because the cache
+// code calls .GetSlug and .GetTemplateHash on it.
+func primeSharedData(t *testing.T, k8s *objectcache.K8sObjectCacheMock, containerID, wlid string) {
+	t.Helper()
+	ids, err := instanceidhandlerV1.GenerateInstanceIDFromPod(&corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "nginx-abc", Namespace: "default"},
+		Spec: corev1.PodSpec{
+			Containers: []corev1.Container{{Name: "nginx", Image: "nginx:1.25"}},
+		},
+		Status: corev1.PodStatus{
+			ContainerStatuses: []corev1.ContainerStatus{{Name: "nginx", ImageID: "sha256:deadbeef"}},
+		},
+	})
+	require.NoError(t, err)
+	require.NotEmpty(t, ids)
+	k8s.SetSharedContainerData(containerID, &objectcache.WatchedContainerData{
+		InstanceID: ids[0],
+		Wlid:       wlid,
+	})
+}
+
+// eventContainer returns a minimal *containercollection.Container.
+func eventContainer(id string) *containercollection.Container {
+	return &containercollection.Container{
+		Runtime: containercollection.RuntimeMetadata{BasicRuntimeMetadata: eventtypes.BasicRuntimeMetadata{
+			ContainerID:   id,
+			ContainerName: "nginx",
+			ContainerPID:  42,
+		}},
+		K8s: containercollection.K8sMetadata{BasicK8sMetadata: eventtypes.BasicK8sMetadata{
+			Namespace: "default",
+			PodName:   "nginx-abc",
+		}},
+	}
+}
+
+// TestSharedFastPath_NoOverlay verifies that two separate add calls for the
+// same CP yield entries that share the very same *ContainerProfile pointer.
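+// Sharing one pointer across entries is safe because CachedContainerProfile
+// treats Profile as read-only once stored (see the type's doc comment).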
+func TestSharedFastPath_NoOverlay(t *testing.T) {
+	cp := &v1beta1.ContainerProfile{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            "cp-shared",
+			Namespace:       "default",
+			ResourceVersion: "7",
+			Annotations: map[string]string{
+				helpersv1.CompletionMetadataKey: helpersv1.Full,
+				helpersv1.StatusMetadataKey:     helpersv1.Completed,
+			},
+		},
+		Spec: v1beta1.ContainerProfileSpec{
+			Capabilities: []string{"NET_ADMIN"},
+		},
+	}
+	client := &fakeProfileClient{cp: cp}
+	c, k8s := newTestCache(t, client)
+
+	ids := []string{"container-id-A", "container-id-B"}
+	for _, id := range ids {
+		primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx")
+		require.NoError(t, c.addContainer(eventContainer(id), context.Background()))
+	}
+
+	entryA, okA := c.entries.Load(ids[0])
+	entryB, okB := c.entries.Load(ids[1])
+	require.True(t, okA)
+	require.True(t, okB)
+	assert.True(t, entryA.Shared, "fast path must mark entry Shared=true")
+	assert.True(t, entryB.Shared, "fast path must mark entry Shared=true")
+	assert.Same(t, entryA.Profile, entryB.Profile, "both entries must share the same storage-fetched pointer")
+	assert.Same(t, cp, entryA.Profile, "fast path must not DeepCopy")
+}
+
+// TestOverlayPath_DeepCopies verifies that when userAP is present we build a
+// distinct DeepCopy (pointer inequality with the storage-fetched cp) and mark
+// Shared=false.
+func TestOverlayPath_DeepCopies(t *testing.T) {
+	cp := &v1beta1.ContainerProfile{
+		ObjectMeta: metav1.ObjectMeta{Name: "cp-1", Namespace: "default", ResourceVersion: "1"},
+		Spec:       v1beta1.ContainerProfileSpec{Capabilities: []string{"SYS_PTRACE"}},
+	}
+	userAP := &v1beta1.ApplicationProfile{
+		ObjectMeta: metav1.ObjectMeta{Name: "override", Namespace: "default", ResourceVersion: "u1"},
+		Spec: v1beta1.ApplicationProfileSpec{
+			Containers: []v1beta1.ApplicationProfileContainer{{
+				Name:         "nginx",
+				Capabilities: []string{"NET_BIND_SERVICE"},
+			}},
+		},
+	}
+	client := &fakeProfileClient{cp: cp, ap: userAP, overlayOnly: "override"}
+	c, k8s := newTestCache(t, client)
+
+	id := "container-overlay"
+	primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx")
+
+	ev := eventContainer(id)
+	ev.K8s.PodLabels = map[string]string{helpersv1.UserDefinedProfileMetadataKey: "override"}
+	require.NoError(t, c.addContainer(ev, context.Background()))
+
+	entry, ok := c.entries.Load(id)
+	require.True(t, ok)
+	assert.False(t, entry.Shared, "overlay path must mark Shared=false")
+	assert.NotSame(t, cp, entry.Profile, "overlay path must DeepCopy, not share")
+	// Merged caps: base + user
+	assert.ElementsMatch(t, []string{"SYS_PTRACE", "NET_BIND_SERVICE"}, entry.Profile.Spec.Capabilities)
+	require.NotNil(t, entry.UserAPRef)
+	assert.Equal(t, "override", entry.UserAPRef.Name)
+	assert.Equal(t, "u1", entry.UserAPRV)
+}
+
+// TestDeleteContainer_LockAndCleanup verifies that deleteContainer removes
+// the entry while intentionally retaining the per-container lock entry, per
+// the race-avoidance design documented on deleteContainer.
+func TestDeleteContainer_LockAndCleanup(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp-delete", Namespace: "default", ResourceVersion: "1"}, + } + client := &fakeProfileClient{cp: cp} + c, k8s := newTestCache(t, client) + + id := "container-delete" + primeSharedData(t, k8s, id, "wlid://x") + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + require.True(t, c.containerLocks.HasLock(id), "lock should exist after add") + require.NotNil(t, c.GetContainerProfile(id)) + + c.deleteContainer(id) + assert.Nil(t, c.GetContainerProfile(id), "entry must be gone after delete") + // Phase-4 review fix: deleteContainer intentionally does NOT release the + // lock to avoid a race where a concurrent addContainer could hold a + // reference to a mutex that another caller re-creates after Delete. + // Memory cost is bounded by live+recently-deleted container IDs. + assert.True(t, c.containerLocks.HasLock(id), "lock is retained by design after delete") +} + +// TestContainerCallback_IgnoredContainer verifies IgnoreContainer short-circuits +// before any storage call is issued. +func TestContainerCallback_IgnoredContainer(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}} + client := &fakeProfileClient{cp: cp} + c, _ := newTestCache(t, client) + c.cfg.ExcludeNamespaces = []string{"kube-system"} + + ev := containercollection.PubSubEvent{ + Type: containercollection.EventTypeAddContainer, + Container: &containercollection.Container{ + Runtime: containercollection.RuntimeMetadata{BasicRuntimeMetadata: eventtypes.BasicRuntimeMetadata{ + ContainerID: "ignored", ContainerPID: 42, ContainerName: "c", + }}, + K8s: containercollection.K8sMetadata{BasicK8sMetadata: eventtypes.BasicK8sMetadata{ + Namespace: "kube-system", PodName: "p", + }}, + }, + } + c.ContainerCallback(ev) + // Allow any mistakenly-spawned goroutine a brief window — none should run. + time.Sleep(20 * time.Millisecond) + assert.Equal(t, 0, client.getCPCalls, "IgnoreContainer must short-circuit before any storage call") +} + +// TestContainerCallback_HostContainer verifies that host containers do NOT +// trigger IgnoreContainer even when their namespace is in ExcludeNamespaces +// (host events carry namespace="host" after override, not the original one). +func TestContainerCallback_HostContainer(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "host", ResourceVersion: "1"}} + client := &fakeProfileClient{cp: cp} + c, _ := newTestCache(t, client) + // Even with every namespace excluded, host containers bypass the check. + c.cfg.ExcludeNamespaces = []string{"default", "host"} + + hostContainer := &containercollection.Container{ + Runtime: containercollection.RuntimeMetadata{BasicRuntimeMetadata: eventtypes.BasicRuntimeMetadata{ + ContainerID: "host-c", ContainerPID: 1, ContainerName: "host", + }}, + K8s: containercollection.K8sMetadata{BasicK8sMetadata: eventtypes.BasicK8sMetadata{ + Namespace: "default", PodName: "", + }}, + } + c.ContainerCallback(containercollection.PubSubEvent{Type: containercollection.EventTypeAddContainer, Container: hostContainer}) + // The callback dispatches a goroutine that will stall on backoff (no + // shared data is primed) — we only assert the callback returns without + // panic and did not short-circuit on IgnoreContainer. 
We cannot assert + // storage was called without racing the backoff; just confirm no panic. + time.Sleep(20 * time.Millisecond) +} + +// TestCallStackIndexBuiltFromProfile verifies that the call-stack tree is +// populated from CP.Spec.IdentifiedCallStacks and retrievable via +// GetCallStackSearchTree. +func TestCallStackIndexBuiltFromProfile(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp-stack", Namespace: "default", ResourceVersion: "1"}, + Spec: v1beta1.ContainerProfileSpec{ + IdentifiedCallStacks: []v1beta1.IdentifiedCallStack{ + { + CallID: "r1", + CallStack: v1beta1.CallStack{Root: v1beta1.CallStackNode{ + Frame: v1beta1.StackFrame{FileID: "f1", Lineno: "10"}, + Children: []v1beta1.CallStackNode{ + {Frame: v1beta1.StackFrame{FileID: "f2", Lineno: "20"}}, + }, + }}, + }, + }, + }, + } + client := &fakeProfileClient{cp: cp} + c, k8s := newTestCache(t, client) + + id := "c-stack" + primeSharedData(t, k8s, id, "wlid://x") + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + + tree := c.GetCallStackSearchTree(id) + require.NotNil(t, tree) + require.NotNil(t, tree.PathsByCallID) + _, hasCallID := tree.PathsByCallID["r1"] + assert.True(t, hasCallID, "call-stack tree must contain CallID 'r1' from CP") +} + +// TestGetContainerProfile_Miss sanity-checks the nil path returns nil and a +// synthetic error ProfileState (no panic). +func TestGetContainerProfile_Miss(t *testing.T) { + c, _ := newTestCache(t, &fakeProfileClient{}) + assert.Nil(t, c.GetContainerProfile("nope")) + state := c.GetContainerProfileState("nope") + require.NotNil(t, state) + require.Error(t, state.Error) +} + +// TestStorageError_NoEntry ensures storage errors don't panic and don't +// populate a cache entry. +func TestStorageError_NoEntry(t *testing.T) { + client := &fakeProfileClient{cpErr: errors.New("kaboom")} + c, k8s := newTestCache(t, client) + id := "c-err" + primeSharedData(t, k8s, id, "wlid://x") + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + _, ok := c.entries.Load(id) + assert.False(t, ok, "storage error must not create a cache entry") +} diff --git a/pkg/objectcache/containerprofilecache/export_test.go b/pkg/objectcache/containerprofilecache/export_test.go new file mode 100644 index 0000000000..c5277665c0 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/export_test.go @@ -0,0 +1,50 @@ +package containerprofilecache + +// export_test.go exposes internal symbols to the containerprofilecache_test +// package (the *_test.go files in this directory). Compiled only during +// `go test`; never included in the production binary. + +import "context" + +func (c *ContainerProfileCacheImpl) ReconcileOnce(ctx context.Context) { + c.reconcileOnce(ctx) +} + +func (c *ContainerProfileCacheImpl) SeedEntryForTest(containerID string, entry *CachedContainerProfile) { + c.entries.Set(containerID, entry) +} + +func (c *ContainerProfileCacheImpl) RefreshAllEntriesForTest(ctx context.Context) { + c.refreshAllEntries(ctx) +} + +// WarmContainerLocksForTest acquires and immediately releases each container +// lock, initialising the internal SafeMap before the concurrent phase to avoid +// the goradd/maps nil-check-before-lock initialisation race (SafeMap v1.3.0). 
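+// TestLockStressAddEvictInterleaved shows the intended warm-up order: the
+// container locks first, then the pending map, then seeded entries, all
+// before any worker goroutine starts.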
+func (c *ContainerProfileCacheImpl) WarmContainerLocksForTest(ids []string) { + for _, id := range ids { + c.containerLocks.WithLock(id, func() {}) + } +} + +// WarmPendingForTest initialises the pending SafeMap via a Set+Delete cycle +// for each id, preventing the goradd/maps nil-check-before-lock race in +// SafeMap.Len / SafeMap.Delete during concurrent test phases. +func (c *ContainerProfileCacheImpl) WarmPendingForTest(ids []string) { + for _, id := range ids { + c.pending.Set(id, nil) + c.pending.Delete(id) + } +} + +// SeedEntryWithOverlayForTest seeds an entry with user AP and NN overlay refs. +// Pass empty strings to leave a ref nil. +func (c *ContainerProfileCacheImpl) SeedEntryWithOverlayForTest(containerID string, entry *CachedContainerProfile, apNS, apName, nnNS, nnName string) { + if apName != "" { + entry.UserAPRef = &namespacedName{Namespace: apNS, Name: apName} + } + if nnName != "" { + entry.UserNNRef = &namespacedName{Namespace: nnNS, Name: nnName} + } + c.entries.Set(containerID, entry) +} diff --git a/pkg/objectcache/containerprofilecache/init_eviction_test.go b/pkg/objectcache/containerprofilecache/init_eviction_test.go new file mode 100644 index 0000000000..b7f3535603 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/init_eviction_test.go @@ -0,0 +1,154 @@ +package containerprofilecache_test + +import ( + "context" + "testing" + "time" + + containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/objectcache" + cpc "github.com/kubescape/node-agent/pkg/objectcache/containerprofilecache" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" +) + +// newCPCForEvictionTest wires up a ContainerProfileCacheImpl with the provided +// storage and k8s stubs for eviction testing. Start is NOT called so the +// reconciler goroutine never runs — tests drive ReconcileOnce directly. +func newCPCForEvictionTest(storage *stubStorage, k8s *stubK8sCache) *cpc.ContainerProfileCacheImpl { + cfg := config.Config{ProfilesCacheRefreshRate: 30 * time.Second} + return cpc.NewContainerProfileCache(cfg, storage, k8s, nil) +} + +// seedEntry builds and seeds a minimal CachedContainerProfile into the cache +// using the exported SeedEntryForTest hook. +func seedEntry(cache *cpc.ContainerProfileCacheImpl, containerID string, cp *v1beta1.ContainerProfile, containerName, podName, namespace, podUID string) { + entry := &cpc.CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: containerName, + PodName: podName, + Namespace: namespace, + PodUID: podUID, + CPName: cp.Name, + RV: cp.ResourceVersion, + Shared: true, + } + cache.SeedEntryForTest(containerID, entry) +} + +// TestInitContainerEvictionViaRemoveEvent — T2a. +// +// Pod has 1 init container (initID) + 1 regular container (regID), both seeded +// into the cache. Fire EventTypeRemoveContainer for the init container via +// ContainerCallback. Assert that the init entry is evicted and the regular +// entry is untouched. 
+func TestInitContainerEvictionViaRemoveEvent(t *testing.T) { + const ( + namespace = "default" + podName = "testpod" + initID = "init-container-id" + regID = "regular-container-id" + initName = "init-container" + regularName = "regular" + podUID = "pod-uid-t2a" + ) + + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-test", + Namespace: namespace, + ResourceVersion: "1", + }, + } + store := newFakeStorage(cp) + k8s := newFakeK8sCache() + cache := newCPCForEvictionTest(store, k8s) + + // Seed both containers directly — no goroutines, no races. + seedEntry(cache, initID, cp, initName, podName, namespace, podUID) + seedEntry(cache, regID, cp, regularName, podName, namespace, podUID) + + assert.NotNil(t, cache.GetContainerProfile(initID), "init container must be cached before eviction") + assert.NotNil(t, cache.GetContainerProfile(regID), "regular container must be cached before eviction") + + // Fire remove event for init container only. deleteContainer runs in a + // goroutine; wait for it to complete. + cache.ContainerCallback(containercollection.PubSubEvent{ + Type: containercollection.EventTypeRemoveContainer, + Container: makeTestContainer(initID, podName, namespace, initName), + }) + + // deleteContainer goroutine is very fast (just a map delete + lock release). + assert.Eventually(t, func() bool { + return cache.GetContainerProfile(initID) == nil + }, 3*time.Second, 10*time.Millisecond, "init container entry must be evicted after RemoveContainer event") + + // Regular container must survive. + assert.NotNil(t, cache.GetContainerProfile(regID), "regular container entry must remain after init eviction") +} + +// TestMissedRemoveEventEvictedByReconciler — T2b. +// +// Init container entry is seeded directly. Pod status is then flipped so the +// init container is no longer Running (simulating it finishing without a remove +// event). ReconcileOnce must evict the stale entry. +func TestMissedRemoveEventEvictedByReconciler(t *testing.T) { + const ( + namespace = "default" + podName = "testpod-reconcile" + initID = "init-container-reconcile" + initName = "init-container" + podUID = "pod-uid-reconcile" + ) + + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-reconcile", + Namespace: namespace, + ResourceVersion: "1", + }, + } + store := newFakeStorage(cp) + k8s := newFakeK8sCache() + + // Start: pod shows init container Running. + runningPod := makeTestPod(podName, namespace, podUID, + nil, + []corev1.ContainerStatus{{ + Name: initName, + ContainerID: "containerd://" + initID, + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}, + ) + k8s.setPod(namespace, podName, runningPod) + + cache := newCPCForEvictionTest(store, k8s) + + // Seed init container entry directly. + seedEntry(cache, initID, cp, initName, podName, namespace, podUID) + assert.NotNil(t, cache.GetContainerProfile(initID), "init container must be seeded before reconciler test") + + // Simulate init container finishing: flip status to Terminated, no remove event. + terminatedPod := makeTestPod(podName, namespace, podUID, + nil, + []corev1.ContainerStatus{{ + Name: initName, + ContainerID: "containerd://" + initID, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ExitCode: 0}, + }, + }}, + ) + k8s.setPod(namespace, podName, terminatedPod) + + // Drive the reconciler directly — no tick loop running, no goroutines. 
+ cache.ReconcileOnce(context.Background()) + + assert.Nil(t, cache.GetContainerProfile(initID), + "reconciler must evict init container entry when pod status shows Terminated") +} diff --git a/pkg/objectcache/containerprofilecache/integration_helpers_test.go b/pkg/objectcache/containerprofilecache/integration_helpers_test.go new file mode 100644 index 0000000000..4965f0c732 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/integration_helpers_test.go @@ -0,0 +1,143 @@ +// Integration/acceptance tests for the ContainerProfile cache unification +// (plan v2 §2.7 + §2.8 step 9). Shared test helpers for this package. +package containerprofilecache_test + +import ( + "context" + "sync" + + containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + eventtypes "github.com/inspektor-gadget/inspektor-gadget/pkg/types" + "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/storage" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// makeTestContainer builds a minimal *containercollection.Container for use +// in ContainerCallback events. +func makeTestContainer(id, podName, namespace, containerName string) *containercollection.Container { + return &containercollection.Container{ + Runtime: containercollection.RuntimeMetadata{ + BasicRuntimeMetadata: eventtypes.BasicRuntimeMetadata{ + ContainerID: id, + ContainerName: containerName, + ContainerPID: 42, + }, + }, + K8s: containercollection.K8sMetadata{ + BasicK8sMetadata: eventtypes.BasicK8sMetadata{ + Namespace: namespace, + PodName: podName, + }, + }, + } +} + +// makeTestPod builds a *corev1.Pod with the provided container statuses. +func makeTestPod(name, namespace, uid string, containerStatuses []corev1.ContainerStatus, initStatuses []corev1.ContainerStatus) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + UID: types.UID(uid), + }, + Status: corev1.PodStatus{ + ContainerStatuses: containerStatuses, + InitContainerStatuses: initStatuses, + }, + } +} + +// stubStorage is a minimal storage.ProfileClient stub with settable responses. 
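+// The mutex exists so a test could, hypothetically, swap responses while the
+// cache is reading (no current test does; the snippet below is illustrative):
+//
+//	s.mu.Lock()
+//	s.ap = userAP // subsequent GetApplicationProfile calls return this
+//	s.mu.Unlock()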
+type stubStorage struct { + mu sync.RWMutex + cp *v1beta1.ContainerProfile + ap *v1beta1.ApplicationProfile + nn *v1beta1.NetworkNeighborhood +} + +var _ storage.ProfileClient = (*stubStorage)(nil) + +func newFakeStorage(cp *v1beta1.ContainerProfile) *stubStorage { + return &stubStorage{cp: cp} +} + +func (s *stubStorage) GetContainerProfile(_ context.Context, _, _ string) (*v1beta1.ContainerProfile, error) { + s.mu.RLock() + defer s.mu.RUnlock() + return s.cp, nil +} + +func (s *stubStorage) GetApplicationProfile(_ context.Context, _, _ string) (*v1beta1.ApplicationProfile, error) { + s.mu.RLock() + defer s.mu.RUnlock() + return s.ap, nil +} + +func (s *stubStorage) GetNetworkNeighborhood(_ context.Context, _, _ string) (*v1beta1.NetworkNeighborhood, error) { + s.mu.RLock() + defer s.mu.RUnlock() + return s.nn, nil +} + +func (s *stubStorage) ListApplicationProfiles(_ context.Context, _ string, _ int64, _ string) (*v1beta1.ApplicationProfileList, error) { + return &v1beta1.ApplicationProfileList{}, nil +} + +func (s *stubStorage) ListNetworkNeighborhoods(_ context.Context, _ string, _ int64, _ string) (*v1beta1.NetworkNeighborhoodList, error) { + return &v1beta1.NetworkNeighborhoodList{}, nil +} + +// stubK8sCache is a controllable K8sObjectCache stub. +type stubK8sCache struct { + mu sync.RWMutex + pods map[string]*corev1.Pod + data map[string]*objectcache.WatchedContainerData +} + +var _ objectcache.K8sObjectCache = (*stubK8sCache)(nil) + +func newFakeK8sCache() *stubK8sCache { + return &stubK8sCache{ + pods: make(map[string]*corev1.Pod), + data: make(map[string]*objectcache.WatchedContainerData), + } +} + +func (k *stubK8sCache) setPod(namespace, podName string, pod *corev1.Pod) { + k.mu.Lock() + defer k.mu.Unlock() + k.pods[namespace+"/"+podName] = pod +} + +func (k *stubK8sCache) GetPod(namespace, podName string) *corev1.Pod { + k.mu.RLock() + defer k.mu.RUnlock() + return k.pods[namespace+"/"+podName] +} + +func (k *stubK8sCache) GetPodSpec(_, _ string) *corev1.PodSpec { return nil } +func (k *stubK8sCache) GetPodStatus(_, _ string) *corev1.PodStatus { return nil } +func (k *stubK8sCache) GetApiServerIpAddress() string { return "" } +func (k *stubK8sCache) GetPods() []*corev1.Pod { return nil } + +func (k *stubK8sCache) SetSharedContainerData(id string, d *objectcache.WatchedContainerData) { + k.mu.Lock() + defer k.mu.Unlock() + k.data[id] = d +} + +func (k *stubK8sCache) GetSharedContainerData(id string) *objectcache.WatchedContainerData { + k.mu.RLock() + defer k.mu.RUnlock() + return k.data[id] +} + +func (k *stubK8sCache) DeleteSharedContainerData(id string) { + k.mu.Lock() + defer k.mu.Unlock() + delete(k.data, id) +} diff --git a/pkg/objectcache/containerprofilecache/lock_stress_test.go b/pkg/objectcache/containerprofilecache/lock_stress_test.go new file mode 100644 index 0000000000..d690b94cf7 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/lock_stress_test.go @@ -0,0 +1,200 @@ +package containerprofilecache_test + +import ( + "context" + "math/rand" + "runtime" + "sync" + "testing" + "time" + + containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/objectcache" + cpc "github.com/kubescape/node-agent/pkg/objectcache/containerprofilecache" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + instanceidhandlerV1 
"github.com/kubescape/k8s-interface/instanceidhandler/v1" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" +) + +// TestLockStressAddEvictInterleaved — T7. +// +// 100 goroutines, each running 50 iterations of random seed/delete for a pool +// of 10 container IDs. Uses SeedEntryForTest + deleteContainer (via +// EventTypeRemoveContainer → deleteContainer path) to test the cache's +// per-container locking under concurrent interleaved add/evict. +// +// NOTE on race detector: goradd/maps v1.3.0 has a pre-existing data race in +// SafeMap.Load / SafeMap.Len (nil-check outside the read-lock vs Set +// initialization write). This race is present in pkg/resourcelocks own tests +// (TestConcurrentMultipleContainers fails with -race even before this commit). +// To avoid triggering that upstream race, all SafeMap instances are +// pre-warmed (via SeedEntryForTest) before the concurrent phase starts. +func TestLockStressAddEvictInterleaved(t *testing.T) { + const ( + namespace = "default" + podName = "stress-pod" + podUID = "stress-pod-uid" + numWorkers = 100 + numIters = 50 + poolSize = 10 + wlid = "wlid://cluster-test/namespace-default/deployment-stress" + ) + + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-stress", + Namespace: namespace, + ResourceVersion: "1", + }, + } + store := newFakeStorage(cp) + k8s := newFakeK8sCache() + + // Prime shared data for each container in the pool so that the internal + // waitForSharedContainerData path resolves if needed. + containerIDs := make([]string, poolSize) + for i := 0; i < poolSize; i++ { + id := "stress-container-" + itoa3(i) + containerIDs[i] = id + primeSharedDataForStress(t, k8s, id, podName, namespace, "container-"+itoa3(i), wlid) + } + + cfg := config.Config{ProfilesCacheRefreshRate: 30 * time.Second} + // Start is NOT called — no background reconciler goroutine runs. + cache := cpc.NewContainerProfileCache(cfg, store, k8s, nil) + + // Pre-warm all internal SafeMap instances before the concurrent phase to + // avoid triggering the goradd/maps nil-check-before-lock initialization + // race (pre-existing upstream bug in SafeMap.Load / SafeMap.Len). + // WarmContainerLocksForTest pre-initialises the containerLocks SafeMap; + // SeedEntryForTest pre-initialises the entries SafeMap; + // WarmPendingForTest pre-initialises the pending SafeMap (touched by + // deleteContainer via ContainerCallback(EventTypeRemoveContainer)). + cache.WarmContainerLocksForTest(containerIDs) + cache.WarmPendingForTest(containerIDs) + for _, id := range containerIDs { + cache.SeedEntryForTest(id, &cpc.CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "container", + PodName: podName, + Namespace: namespace, + PodUID: podUID, + CPName: cp.Name, + RV: cp.ResourceVersion, + Shared: true, + }) + } + + baseline := runtime.NumGoroutine() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + var wg sync.WaitGroup + wg.Add(numWorkers) + for w := 0; w < numWorkers; w++ { + go func(worker int) { + defer wg.Done() + r := rand.New(rand.NewSource(time.Now().UnixNano() + int64(worker))) + for iter := 0; iter < numIters; iter++ { + if ctx.Err() != nil { + return + } + id := containerIDs[r.Intn(poolSize)] + if r.Intn(2) == 0 { + // Add path: seed entry directly (no goroutine spawn, + // no backoff, no storage RPC — pure lock stress). 
+ cache.SeedEntryForTest(id, &cpc.CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "container", + PodName: podName, + Namespace: namespace, + PodUID: podUID, + CPName: cp.Name, + RV: cp.ResourceVersion, + Shared: true, + }) + } else { + // Evict path: use the production remove-event path so + // deleteContainer and per-container locking are exercised. + cache.ContainerCallback(containercollection.PubSubEvent{ + Type: containercollection.EventTypeRemoveContainer, + Container: makeTestContainer(id, podName, namespace, "container"), + }) + } + time.Sleep(time.Millisecond * time.Duration(r.Intn(2))) + } + }(w) + } + + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + + select { + case <-done: + // all goroutines finished within budget + case <-ctx.Done(): + t.Fatal("TestLockStressAddEvictInterleaved timed out after 5s") + } + + // ContainerCallback(EventTypeRemoveContainer) spawns go deleteContainer(...) + // asynchronously, so those goroutines may still be running immediately after + // wg.Wait(). Poll briefly until they drain before asserting goroutine count. + drainDeadline := time.Now().Add(200 * time.Millisecond) + for runtime.NumGoroutine() > baseline+10 && time.Now().Before(drainDeadline) { + runtime.Gosched() + time.Sleep(5 * time.Millisecond) + } + runtime.GC() + assert.LessOrEqual(t, runtime.NumGoroutine(), baseline+10, + "goroutine count should stay near baseline (no leaked goroutines)") + + // Implicit: if any goroutine panicked the test would have already failed. + assert.True(t, true, "no panic occurred") +} + +// primeSharedDataForStress primes shared data for a container used in the +// stress test. +func primeSharedDataForStress(t *testing.T, k8s *stubK8sCache, containerID, podName, namespace, containerName, wlid string) { + t.Helper() + ids, err := instanceidhandlerV1.GenerateInstanceIDFromPod(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace}, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: containerName, Image: "nginx:1.25"}}, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{{Name: containerName, ImageID: "sha256:deadbeef"}}, + }, + }) + require.NoError(t, err) + require.NotEmpty(t, ids) + k8s.SetSharedContainerData(containerID, &objectcache.WatchedContainerData{ + InstanceID: ids[0], + Wlid: wlid, + }) +} + +// itoa3 converts a small non-negative int to a string without strconv. +func itoa3(i int) string { + if i == 0 { + return "0" + } + buf := [10]byte{} + pos := len(buf) + for i > 0 { + pos-- + buf[pos] = byte('0' + i%10) + i /= 10 + } + return string(buf[pos:]) +} diff --git a/pkg/objectcache/containerprofilecache/metrics.go b/pkg/objectcache/containerprofilecache/metrics.go new file mode 100644 index 0000000000..3a3a48cee7 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/metrics.go @@ -0,0 +1,66 @@ +package containerprofilecache + +import ( + "fmt" + + "github.com/kubescape/go-logger" + "github.com/kubescape/go-logger/helpers" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" +) + +// Kind labels for ReportContainerProfileLegacyLoad and related metrics. +const ( + kindApplication = "application" + kindNetwork = "network" + + completenessFull = "full" + completenessPartial = "partial" +) + +// reportDeprecationWarn emits a one-shot WARN log for a user-authored legacy +// CRD (ApplicationProfile or NetworkNeighborhood) that was merged into the +// ContainerProfile. 
Dedup key is (kind, namespace, name, resourceVersion) so a +// single RV only logs once per process lifetime, even across many containers. +func (c *ContainerProfileCacheImpl) reportDeprecationWarn(kind, namespace, name, rv string, reason string) { + key := fmt.Sprintf("%s|%s/%s@%s", kind, namespace, name, rv) + if _, already := c.deprecationDedup.LoadOrStore(key, struct{}{}); already { + return + } + logger.L().Warning("ContainerProfileCache - user-authored legacy profile merged (deprecated)", + helpers.String("kind", kind), + helpers.String("namespace", namespace), + helpers.String("name", name), + helpers.String("resourceVersion", rv), + helpers.String("reason", reason)) +} + +// emitOverlayMetrics fires the per-kind completeness metric + deprecation WARN +// once per (kind, namespace, name, rv). Shared by addContainer's buildEntry +// and the reconciler's rebuildEntry so the two stay in lockstep. +func (c *ContainerProfileCacheImpl) emitOverlayMetrics( + userAP *v1beta1.ApplicationProfile, + userNN *v1beta1.NetworkNeighborhood, + warnings []partialProfileWarning, +) { + partialByKind := map[string]struct{}{} + for _, w := range warnings { + partialByKind[w.Kind] = struct{}{} + c.metricsManager.ReportContainerProfileLegacyLoad(w.Kind, completenessPartial) + c.reportDeprecationWarn(w.Kind, w.Namespace, w.Name, w.ResourceVersion, + fmt.Sprintf("pod has containers missing from user CRD: %v", w.MissingContainers)) + } + if userAP != nil { + if _, partial := partialByKind[kindApplication]; !partial { + c.metricsManager.ReportContainerProfileLegacyLoad(kindApplication, completenessFull) + } + c.reportDeprecationWarn(kindApplication, userAP.Namespace, userAP.Name, userAP.ResourceVersion, + "user-authored ApplicationProfile merged into ContainerProfile") + } + if userNN != nil { + if _, partial := partialByKind[kindNetwork]; !partial { + c.metricsManager.ReportContainerProfileLegacyLoad(kindNetwork, completenessFull) + } + c.reportDeprecationWarn(kindNetwork, userNN.Namespace, userNN.Name, userNN.ResourceVersion, + "user-authored NetworkNeighborhood merged into ContainerProfile") + } +} diff --git a/pkg/objectcache/containerprofilecache/packages_deleted_test.go b/pkg/objectcache/containerprofilecache/packages_deleted_test.go new file mode 100644 index 0000000000..3396e56d4c --- /dev/null +++ b/pkg/objectcache/containerprofilecache/packages_deleted_test.go @@ -0,0 +1,73 @@ +package containerprofilecache_test + +import ( + "strings" + "testing" + + "golang.org/x/tools/go/packages" +) + +// TestLegacyPackagesDeleted — T5. +// +// Walks the full dependency graph of ./... and asserts that neither of the +// deleted legacy cache packages appears as a reachable import path. Any +// surviving importer is listed in the failure message. +func TestLegacyPackagesDeleted(t *testing.T) { + const ( + legacyAP = "github.com/kubescape/node-agent/pkg/objectcache/applicationprofilecache" + legacyNN = "github.com/kubescape/node-agent/pkg/objectcache/networkneighborhoodcache" + ) + + cfg := &packages.Config{ + Mode: packages.NeedName | packages.NeedImports | packages.NeedDeps, + // Load from the module root so that ./... expands correctly. + Dir: "../../..", + } + + pkgs, err := packages.Load(cfg, "./...") + if err != nil { + t.Fatalf("packages.Load failed: %v", err) + } + + // Collect errors from the package loader (missing modules, parse errors, …). 
+ var loadErrs []string + packages.Visit(pkgs, nil, func(p *packages.Package) { + for _, e := range p.Errors { + loadErrs = append(loadErrs, e.Msg) + } + }) + if len(loadErrs) > 0 { + // Non-fatal: the loader often emits spurious CGO / build-tag errors on + // CI. We only fail if we can't inspect any packages at all. + t.Logf("packages.Load reported %d non-fatal errors (first: %s)", len(loadErrs), loadErrs[0]) + } + + if len(pkgs) == 0 { + t.Fatal("packages.Load returned no packages — cannot verify legacy-path absence") + } + + // Build import-path → importing package map for the two legacy paths. + importers := map[string][]string{ + legacyAP: {}, + legacyNN: {}, + } + + packages.Visit(pkgs, func(p *packages.Package) bool { + for importPath := range p.Imports { + if importPath == legacyAP { + importers[legacyAP] = append(importers[legacyAP], p.PkgPath) + } + if importPath == legacyNN { + importers[legacyNN] = append(importers[legacyNN], p.PkgPath) + } + } + return true + }, nil) + + for legacy, importerList := range importers { + if len(importerList) > 0 { + t.Errorf("legacy package %q is still imported by:\n %s", + legacy, strings.Join(importerList, "\n ")) + } + } +} diff --git a/pkg/objectcache/containerprofilecache/projection.go b/pkg/objectcache/containerprofilecache/projection.go new file mode 100644 index 0000000000..1ff1bd1032 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/projection.go @@ -0,0 +1,339 @@ +package containerprofilecache + +import ( + "github.com/kubescape/node-agent/pkg/utils" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// partialProfileWarning describes a user-authored legacy CRD that couldn't be +// fully merged into the ContainerProfile (e.g. the user CRD is missing entries +// for containers that exist in the pod spec). Emitted by the cache at merge +// time for deprecation observability. +type partialProfileWarning struct { + Kind string // "application" | "network" + Namespace string + Name string + ResourceVersion string + MissingContainers []string +} + +// projectUserProfiles overlays a user-authored ApplicationProfile and/or +// NetworkNeighborhood onto a base ContainerProfile for a single container. +// Returns a DeepCopy of the base with user fields merged in and a list of +// partial-merge warnings when the user CRD doesn't cover every container in +// the pod spec. +// +// cp MUST be non-nil. Either (or both) of userAP / userNN may be nil; nil +// user inputs contribute no merge but also no warning. pod may be nil, in +// which case the missing-container check is skipped (but the name-based +// per-container merge still runs). 
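+//
+// Illustrative call shape (hypothetical values, mirroring this package's unit
+// tests — not additional behavior):
+//
+//	projected, warnings := projectUserProfiles(cp, userAP, userNN, pod, "nginx")
+//	// projected: DeepCopy of cp with the "nginx" entries of userAP and
+//	// userNN merged in; warnings: one entry per user CRD that fails to
+//	// cover every container in pod.Spec.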
+func projectUserProfiles( + cp *v1beta1.ContainerProfile, + userAP *v1beta1.ApplicationProfile, + userNN *v1beta1.NetworkNeighborhood, + pod *corev1.Pod, + containerName string, +) (projected *v1beta1.ContainerProfile, warnings []partialProfileWarning) { + projected = cp.DeepCopy() + + if userAP != nil { + if missing := mergeApplicationProfile(projected, userAP, pod, containerName); len(missing) > 0 { + warnings = append(warnings, partialProfileWarning{ + Kind: kindApplication, + Namespace: userAP.Namespace, + Name: userAP.Name, + ResourceVersion: userAP.ResourceVersion, + MissingContainers: missing, + }) + } + } + + if userNN != nil { + if missing := mergeNetworkNeighborhood(projected, userNN, pod, containerName); len(missing) > 0 { + warnings = append(warnings, partialProfileWarning{ + Kind: kindNetwork, + Namespace: userNN.Namespace, + Name: userNN.Name, + ResourceVersion: userNN.ResourceVersion, + MissingContainers: missing, + }) + } + } + + return projected, warnings +} + +// mergeApplicationProfile finds the container entry in userAP matching +// containerName (across Spec.Containers / InitContainers / EphemeralContainers) +// and merges its fields into projected.Spec. Returns the list of pod-spec +// container names that are not present anywhere in userAP.Spec. +// +// ported from pkg/objectcache/applicationprofilecache/applicationprofilecache.go:660-673 +// (mergeContainer), applied here to a single-container ContainerProfile +// instead of a full ApplicationProfile. +func mergeApplicationProfile(projected *v1beta1.ContainerProfile, userAP *v1beta1.ApplicationProfile, pod *corev1.Pod, containerName string) []string { + // Defensive copy: slices inside matched (e.g. Execs[i].Args, Opens[i].Flags, + // Endpoints[i].Methods) would otherwise alias the caller's CRD object and + // could change if the CRD is refreshed concurrently. + userAP = userAP.DeepCopy() + if matched := findUserAPContainer(userAP, containerName); matched != nil { + projected.Spec.Capabilities = append(projected.Spec.Capabilities, matched.Capabilities...) + projected.Spec.Execs = append(projected.Spec.Execs, matched.Execs...) + projected.Spec.Opens = append(projected.Spec.Opens, matched.Opens...) + projected.Spec.Syscalls = append(projected.Spec.Syscalls, matched.Syscalls...) + projected.Spec.Endpoints = append(projected.Spec.Endpoints, matched.Endpoints...) + if projected.Spec.PolicyByRuleId == nil && len(matched.PolicyByRuleId) > 0 { + projected.Spec.PolicyByRuleId = make(map[string]v1beta1.RulePolicy, len(matched.PolicyByRuleId)) + } + for k, v := range matched.PolicyByRuleId { + if existing, ok := projected.Spec.PolicyByRuleId[k]; ok { + projected.Spec.PolicyByRuleId[k] = utils.MergePolicies(existing, v) + } else { + projected.Spec.PolicyByRuleId[k] = v + } + } + } + + return missingPodContainers(pod, userAPNames(userAP)) +} + +// mergeNetworkNeighborhood finds the container entry in userNN matching +// containerName and merges its Ingress/Egress into projected.Spec, then +// overlays the user CRD's pod LabelSelector onto projected's embedded +// LabelSelector. Returns missing-from-userNN pod container names. +// +// ported from pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go:560-636 +// (performMerge, mergeContainer, mergeNetworkNeighbors) applied to a single +// container's rules on a ContainerProfile. 
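+//
+// A minimal sketch of the merge semantics (hypothetical values): neighbors
+// sharing an Identifier are merged field-wise (DNSNames unioned), new
+// identifiers are appended, and the user CRD's selector overlays the
+// embedded one.
+//
+//	base.Ingress:   [{Identifier: "ing-1", DNSNames: ["a.svc.local"]}]
+//	userNN Ingress: [{Identifier: "ing-1", DNSNames: ["b.svc.local"]}, {Identifier: "ing-2"}]
+//	result.Ingress: [{"ing-1", ["a.svc.local", "b.svc.local"]}, {"ing-2"}]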
+func mergeNetworkNeighborhood(projected *v1beta1.ContainerProfile, userNN *v1beta1.NetworkNeighborhood, pod *corev1.Pod, containerName string) []string { + // Defensive copy: neighbor slices (DNSNames, Ports, MatchExpressions) and + // LabelSelector.MatchExpressions would otherwise alias the caller's CRD. + userNN = userNN.DeepCopy() + if matched := findUserNNContainer(userNN, containerName); matched != nil { + projected.Spec.Ingress = mergeNetworkNeighbors(projected.Spec.Ingress, matched.Ingress) + projected.Spec.Egress = mergeNetworkNeighbors(projected.Spec.Egress, matched.Egress) + } + + // Merge LabelSelector (ContainerProfileSpec embeds metav1.LabelSelector). + if userNN.Spec.LabelSelector.MatchLabels != nil { + if projected.Spec.LabelSelector.MatchLabels == nil { + projected.Spec.LabelSelector.MatchLabels = make(map[string]string) + } + for k, v := range userNN.Spec.LabelSelector.MatchLabels { + projected.Spec.LabelSelector.MatchLabels[k] = v + } + } + projected.Spec.LabelSelector.MatchExpressions = append( + projected.Spec.LabelSelector.MatchExpressions, + userNN.Spec.LabelSelector.MatchExpressions..., + ) + + return missingPodContainers(pod, userNNNames(userNN)) +} + +func findUserAPContainer(userAP *v1beta1.ApplicationProfile, containerName string) *v1beta1.ApplicationProfileContainer { + if userAP == nil { + return nil + } + for i := range userAP.Spec.Containers { + if userAP.Spec.Containers[i].Name == containerName { + return &userAP.Spec.Containers[i] + } + } + for i := range userAP.Spec.InitContainers { + if userAP.Spec.InitContainers[i].Name == containerName { + return &userAP.Spec.InitContainers[i] + } + } + for i := range userAP.Spec.EphemeralContainers { + if userAP.Spec.EphemeralContainers[i].Name == containerName { + return &userAP.Spec.EphemeralContainers[i] + } + } + return nil +} + +func findUserNNContainer(userNN *v1beta1.NetworkNeighborhood, containerName string) *v1beta1.NetworkNeighborhoodContainer { + if userNN == nil { + return nil + } + for i := range userNN.Spec.Containers { + if userNN.Spec.Containers[i].Name == containerName { + return &userNN.Spec.Containers[i] + } + } + for i := range userNN.Spec.InitContainers { + if userNN.Spec.InitContainers[i].Name == containerName { + return &userNN.Spec.InitContainers[i] + } + } + for i := range userNN.Spec.EphemeralContainers { + if userNN.Spec.EphemeralContainers[i].Name == containerName { + return &userNN.Spec.EphemeralContainers[i] + } + } + return nil +} + +func userAPNames(userAP *v1beta1.ApplicationProfile) map[string]struct{} { + names := map[string]struct{}{} + if userAP == nil { + return names + } + for _, c := range userAP.Spec.Containers { + names[c.Name] = struct{}{} + } + for _, c := range userAP.Spec.InitContainers { + names[c.Name] = struct{}{} + } + for _, c := range userAP.Spec.EphemeralContainers { + names[c.Name] = struct{}{} + } + return names +} + +func userNNNames(userNN *v1beta1.NetworkNeighborhood) map[string]struct{} { + names := map[string]struct{}{} + if userNN == nil { + return names + } + for _, c := range userNN.Spec.Containers { + names[c.Name] = struct{}{} + } + for _, c := range userNN.Spec.InitContainers { + names[c.Name] = struct{}{} + } + for _, c := range userNN.Spec.EphemeralContainers { + names[c.Name] = struct{}{} + } + return names +} + +// missingPodContainers returns the set of pod-spec container names that are +// not present in the given set. If pod is nil, returns nil (check skipped). 
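+//
+// For example (hypothetical values): a pod with containers "nginx" and
+// "sidecar" checked against a set containing only "nginx" yields
+// ["sidecar"]. All three spec lists (containers, initContainers,
+// ephemeralContainers) are scanned.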
+func missingPodContainers(pod *corev1.Pod, have map[string]struct{}) []string { + if pod == nil { + return nil + } + var missing []string + for _, c := range pod.Spec.Containers { + if _, ok := have[c.Name]; !ok { + missing = append(missing, c.Name) + } + } + for _, c := range pod.Spec.InitContainers { + if _, ok := have[c.Name]; !ok { + missing = append(missing, c.Name) + } + } + for _, c := range pod.Spec.EphemeralContainers { + if _, ok := have[c.Name]; !ok { + missing = append(missing, c.Name) + } + } + return missing +} + +// mergeNetworkNeighbors merges user neighbors into a normal-neighbor list, +// keyed by Identifier. ported from +// pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go:617-636. +func mergeNetworkNeighbors(normalNeighbors, userNeighbors []v1beta1.NetworkNeighbor) []v1beta1.NetworkNeighbor { + neighborMap := make(map[string]int, len(normalNeighbors)) + for i, neighbor := range normalNeighbors { + neighborMap[neighbor.Identifier] = i + } + for _, userNeighbor := range userNeighbors { + if idx, exists := neighborMap[userNeighbor.Identifier]; exists { + normalNeighbors[idx] = mergeNetworkNeighbor(normalNeighbors[idx], userNeighbor) + } else { + normalNeighbors = append(normalNeighbors, userNeighbor) + } + } + return normalNeighbors +} + +// mergeNetworkNeighbor merges a user-managed neighbor into an existing one. +// ported from +// pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go:638-706. +func mergeNetworkNeighbor(normal, user v1beta1.NetworkNeighbor) v1beta1.NetworkNeighbor { + merged := normal.DeepCopy() + + dnsNamesSet := make(map[string]struct{}) + for _, dns := range normal.DNSNames { + dnsNamesSet[dns] = struct{}{} + } + for _, dns := range user.DNSNames { + dnsNamesSet[dns] = struct{}{} + } + merged.DNSNames = make([]string, 0, len(dnsNamesSet)) + for dns := range dnsNamesSet { + merged.DNSNames = append(merged.DNSNames, dns) + } + + merged.Ports = mergeNetworkPorts(merged.Ports, user.Ports) + + if user.PodSelector != nil { + if merged.PodSelector == nil { + merged.PodSelector = &metav1.LabelSelector{} + } + if user.PodSelector.MatchLabels != nil { + if merged.PodSelector.MatchLabels == nil { + merged.PodSelector.MatchLabels = make(map[string]string) + } + for k, v := range user.PodSelector.MatchLabels { + merged.PodSelector.MatchLabels[k] = v + } + } + merged.PodSelector.MatchExpressions = append( + merged.PodSelector.MatchExpressions, + user.PodSelector.MatchExpressions..., + ) + } + + if user.NamespaceSelector != nil { + if merged.NamespaceSelector == nil { + merged.NamespaceSelector = &metav1.LabelSelector{} + } + if user.NamespaceSelector.MatchLabels != nil { + if merged.NamespaceSelector.MatchLabels == nil { + merged.NamespaceSelector.MatchLabels = make(map[string]string) + } + for k, v := range user.NamespaceSelector.MatchLabels { + merged.NamespaceSelector.MatchLabels[k] = v + } + } + merged.NamespaceSelector.MatchExpressions = append( + merged.NamespaceSelector.MatchExpressions, + user.NamespaceSelector.MatchExpressions..., + ) + } + + if user.IPAddress != "" { + merged.IPAddress = user.IPAddress + } + if user.Type != "" { + merged.Type = user.Type + } + + return *merged +} + +// mergeNetworkPorts merges user ports into a normal-ports list, keyed by Name. +// ported from +// pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go:708-727. 
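+//
+// Replacement, not field-merge (hypothetical values): a user port whose Name
+// matches an existing entry replaces it wholesale; unmatched names append.
+//
+//	normal: [{Name: "TCP-80"}]
+//	user:   [{Name: "TCP-80"}, {Name: "TCP-443"}]
+//	merged: [user's "TCP-80", "TCP-443"]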
+func mergeNetworkPorts(normalPorts, userPorts []v1beta1.NetworkPort) []v1beta1.NetworkPort { + portMap := make(map[string]int, len(normalPorts)) + for i, port := range normalPorts { + portMap[port.Name] = i + } + for _, userPort := range userPorts { + if idx, exists := portMap[userPort.Name]; exists { + normalPorts[idx] = userPort + } else { + normalPorts = append(normalPorts, userPort) + } + } + return normalPorts +} diff --git a/pkg/objectcache/containerprofilecache/projection_test.go b/pkg/objectcache/containerprofilecache/projection_test.go new file mode 100644 index 0000000000..85b106ee01 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/projection_test.go @@ -0,0 +1,222 @@ +package containerprofilecache + +import ( + "testing" + + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func baseCP() *v1beta1.ContainerProfile { + return &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}, + Spec: v1beta1.ContainerProfileSpec{ + Capabilities: []string{"SYS_PTRACE"}, + Execs: []v1beta1.ExecCalls{ + {Path: "/bin/ls", Args: []string{"-la"}}, + }, + PolicyByRuleId: map[string]v1beta1.RulePolicy{ + "R0901": {AllowedProcesses: []string{"ls"}}, + }, + Ingress: []v1beta1.NetworkNeighbor{ + {Identifier: "ing-1", DNSNames: []string{"a.svc.local"}}, + }, + }, + } +} + +func podWith(containers ...string) *corev1.Pod { + var cs []corev1.Container + for _, n := range containers { + cs = append(cs, corev1.Container{Name: n}) + } + return &corev1.Pod{Spec: corev1.PodSpec{Containers: cs}} +} + +// TestProjection_UserAPOnly_Match verifies the happy-path merge of a matching +// user AP container: capabilities / execs / policies merged, no warnings. +func TestProjection_UserAPOnly_Match(t *testing.T) { + cp := baseCP() + userAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "ua", Namespace: "default", ResourceVersion: "u1"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Capabilities: []string{"NET_BIND_SERVICE"}, + Execs: []v1beta1.ExecCalls{{Path: "/bin/cat"}}, + PolicyByRuleId: map[string]v1beta1.RulePolicy{ + "R0901": {AllowedProcesses: []string{"cat"}}, + "R0902": {AllowedProcesses: []string{"echo"}}, + }, + }}, + }, + } + pod := podWith("nginx") + + projected, warnings := projectUserProfiles(cp, userAP, nil, pod, "nginx") + require.NotNil(t, projected) + assert.Empty(t, warnings) + assert.NotSame(t, cp, projected, "projected must be a distinct DeepCopy") + assert.ElementsMatch(t, []string{"SYS_PTRACE", "NET_BIND_SERVICE"}, projected.Spec.Capabilities) + assert.Len(t, projected.Spec.Execs, 2) + // R0901 merged, R0902 added + assert.Contains(t, projected.Spec.PolicyByRuleId, "R0901") + assert.Contains(t, projected.Spec.PolicyByRuleId, "R0902") +} + +// TestProjection_UserNNOnly_Match verifies merge of matching NN container: +// ingress merged by Identifier, LabelSelector MatchLabels overlaid. 
+func TestProjection_UserNNOnly_Match(t *testing.T) { + cp := baseCP() + cp.Spec.LabelSelector = metav1.LabelSelector{MatchLabels: map[string]string{"app": "nginx"}} + userNN := &v1beta1.NetworkNeighborhood{ + ObjectMeta: metav1.ObjectMeta{Name: "un", Namespace: "default", ResourceVersion: "n1"}, + Spec: v1beta1.NetworkNeighborhoodSpec{ + LabelSelector: metav1.LabelSelector{ + MatchLabels: map[string]string{"env": "prod"}, + }, + Containers: []v1beta1.NetworkNeighborhoodContainer{{ + Name: "nginx", + Ingress: []v1beta1.NetworkNeighbor{ + {Identifier: "ing-1", DNSNames: []string{"b.svc.local"}}, + {Identifier: "ing-2", DNSNames: []string{"c.svc.local"}}, + }, + }}, + }, + } + pod := podWith("nginx") + + projected, warnings := projectUserProfiles(cp, nil, userNN, pod, "nginx") + require.NotNil(t, projected) + assert.Empty(t, warnings) + require.Len(t, projected.Spec.Ingress, 2) + // ing-1 merged (DNSNames union) + var merged v1beta1.NetworkNeighbor + for _, ing := range projected.Spec.Ingress { + if ing.Identifier == "ing-1" { + merged = ing + break + } + } + assert.ElementsMatch(t, []string{"a.svc.local", "b.svc.local"}, merged.DNSNames) + // LabelSelector overlaid + assert.Equal(t, "nginx", projected.Spec.LabelSelector.MatchLabels["app"]) + assert.Equal(t, "prod", projected.Spec.LabelSelector.MatchLabels["env"]) +} + +// TestProjection_Both verifies both AP and NN can overlay in a single call. +func TestProjection_Both(t *testing.T) { + cp := baseCP() + userAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "ua", Namespace: "default", ResourceVersion: "u1"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Capabilities: []string{"NET_ADMIN"}, + }}, + }, + } + userNN := &v1beta1.NetworkNeighborhood{ + ObjectMeta: metav1.ObjectMeta{Name: "un", Namespace: "default", ResourceVersion: "n1"}, + Spec: v1beta1.NetworkNeighborhoodSpec{ + Containers: []v1beta1.NetworkNeighborhoodContainer{{ + Name: "nginx", + Ingress: []v1beta1.NetworkNeighbor{{Identifier: "ing-new"}}, + }}, + }, + } + pod := podWith("nginx") + + projected, warnings := projectUserProfiles(cp, userAP, userNN, pod, "nginx") + require.NotNil(t, projected) + assert.Empty(t, warnings) + assert.Contains(t, projected.Spec.Capabilities, "NET_ADMIN") + // Original ing-1 plus appended ing-new + assert.Len(t, projected.Spec.Ingress, 2) +} + +// TestProjection_UserAP_NonMatchingContainer verifies that when the user CRD +// doesn't include the target container name, no merge happens — but missing +// pod containers still produce a warning. 
+func TestProjection_UserAP_NonMatchingContainer(t *testing.T) { + cp := baseCP() + userAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "ua", Namespace: "default", ResourceVersion: "u1"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "other", // not "nginx" + Capabilities: []string{"NET_BIND_SERVICE"}, + }}, + }, + } + pod := podWith("nginx", "sidecar") + + projected, warnings := projectUserProfiles(cp, userAP, nil, pod, "nginx") + require.NotNil(t, projected) + // No merge because no container matched "nginx" + assert.ElementsMatch(t, []string{"SYS_PTRACE"}, projected.Spec.Capabilities) + require.Len(t, warnings, 1) + assert.Equal(t, kindApplication, warnings[0].Kind) + assert.ElementsMatch(t, []string{"nginx", "sidecar"}, warnings[0].MissingContainers) +} + +// TestProjection_UserAP_PartialContainers verifies that when the user AP has +// one container but the pod has two, we emit a partial warning naming the +// missing pod container. +func TestProjection_UserAP_PartialContainers(t *testing.T) { + cp := baseCP() + userAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "ua", Namespace: "default", ResourceVersion: "u1"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Capabilities: []string{"NET_BIND_SERVICE"}, + }}, + }, + } + pod := podWith("nginx", "sidecar") + + projected, warnings := projectUserProfiles(cp, userAP, nil, pod, "nginx") + require.NotNil(t, projected) + // Target container merged. + assert.Contains(t, projected.Spec.Capabilities, "NET_BIND_SERVICE") + require.Len(t, warnings, 1) + assert.Equal(t, kindApplication, warnings[0].Kind) + assert.Equal(t, []string{"sidecar"}, warnings[0].MissingContainers) +} + +// TestProjection_NoUserCRDs verifies projection with neither user CRD returns +// a DeepCopy (distinct pointer) and no warnings. +func TestProjection_NoUserCRDs(t *testing.T) { + cp := baseCP() + pod := podWith("nginx") + + projected, warnings := projectUserProfiles(cp, nil, nil, pod, "nginx") + require.NotNil(t, projected) + assert.Empty(t, warnings) + assert.NotSame(t, cp, projected) + assert.Equal(t, cp.Spec.Capabilities, projected.Spec.Capabilities) +} + +// TestProjection_NilPod verifies the merge still runs when pod is nil; the +// missing-container check is skipped (no warning emitted for partial). +func TestProjection_NilPod(t *testing.T) { + cp := baseCP() + userAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "ua", Namespace: "default", ResourceVersion: "u1"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Capabilities: []string{"NET_BIND_SERVICE"}, + }}, + }, + } + + projected, warnings := projectUserProfiles(cp, userAP, nil, nil, "nginx") + require.NotNil(t, projected) + assert.Empty(t, warnings) + assert.Contains(t, projected.Spec.Capabilities, "NET_BIND_SERVICE") +} diff --git a/pkg/objectcache/containerprofilecache/reconciler.go b/pkg/objectcache/containerprofilecache/reconciler.go new file mode 100644 index 0000000000..29c0307af3 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/reconciler.go @@ -0,0 +1,565 @@ +// Package containerprofilecache — reconciler.go +// +// The reconciler is the safety-net eviction path AND the freshness refresh +// loop. Each tick it: +// 1. reconcileOnce: evicts cache entries whose pod is gone or whose +// container is no longer Running. +// 2. 
refreshAllEntries (single-flight via atomic flag): re-fetches the +// consolidated CP, the workload-level AP+NN, the user-managed +// "ug-" AP+NN, and any label-referenced user AP/NN overlay, +// then rebuilds the projection iff any resourceVersion changed. Fast-skip +// when every RV matches what's already cached. +// +// RPC cost @ 300 containers / 30s cadence steady-state: up to 7 gets per +// entry per tick (CP + 3×AP + 3×NN). At 300 entries that's 70 RPC/s in the +// worst case, dropping close to 0 once fast-skip catches on. Most entries +// carry only workload-level AP+NN, so the common case is 3 RPC/tick per +// entry = 30 RPC/s. +package containerprofilecache + +import ( + "context" + "time" + + "github.com/kubescape/go-logger" + "github.com/kubescape/go-logger/helpers" + helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" + "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/objectcache/callstackcache" + "github.com/kubescape/node-agent/pkg/utils" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// tickLoop drives the reconciler. Evict runs synchronously on the tick; +// refresh runs on a single-flight goroutine guarded by refreshInProgress so a +// slow refresh never stacks. +func (c *ContainerProfileCacheImpl) tickLoop(ctx context.Context) { + if c.reconcileEvery == 0 { + c.reconcileEvery = defaultReconcileInterval + } + logger.L().Info("ContainerProfileCache reconciler started", + helpers.String("interval", c.reconcileEvery.String())) + ticker := time.NewTicker(c.reconcileEvery) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + logger.L().Info("ContainerProfileCache reconciler stopped") + return + case <-ticker.C: + start := time.Now() + entriesBefore := c.entries.Len() + pendingBefore := c.pending.Len() + c.reconcileOnce(ctx) + c.retryPendingEntries(ctx) + // Emit the debug breadcrumb only when something actually moved: + // entries delta != 0 OR pending delta != 0. Keeping the log gated + // avoids flooding the journal with identical zero-delta ticks while + // still leaving the observability hook for the test-regression + // investigations that motivated the log. + entriesAfter := c.entries.Len() + pendingAfter := c.pending.Len() + if entriesBefore != entriesAfter || pendingBefore != pendingAfter { + logger.L().Debug("ContainerProfileCache reconciler tick", + helpers.Int("entries_before", entriesBefore), + helpers.Int("entries_after", entriesAfter), + helpers.Int("pending_before", pendingBefore), + helpers.Int("pending_after", pendingAfter)) + } + c.metricsManager.ReportContainerProfileReconcilerDuration("evict", time.Since(start)) + if c.refreshInProgress.CompareAndSwap(false, true) { + go func() { + defer c.refreshInProgress.Store(false) + c.refreshAllEntries(ctx) + }() + } + } + } +} + +// reconcileOnce evicts cache entries whose container is no longer Running. +// Exposed (lowercase but package-public) for tests. +func (c *ContainerProfileCacheImpl) reconcileOnce(ctx context.Context) { + var toEvict []string + c.entries.Range(func(id string, e *CachedContainerProfile) bool { + if ctx.Err() != nil { // delta #3: honor cancellation mid-range + return false + } + pod := c.k8sObjectCache.GetPod(e.Namespace, e.PodName) + if pod == nil { + // Pod not yet in k8s cache (or briefly absent during watch + // resync). 
Do NOT evict — the pod cache routinely lags the
+			// ContainerCallback Add events by tens of seconds on busy nodes,
+			// and evicting here would churn every entry every tick until the
+			// cache catches up. Cleanup for terminated containers flows
+			// through deleteContainer on EventTypeRemoveContainer.
+			return true
+		}
+		// Only evict when the pod IS in cache AND the container has clearly
+		// exited (Terminated state). "Not yet Running" (Waiting state) is
+		// NOT a reason to evict — init containers and pre-running containers
+		// legitimately pass through Waiting before transitioning to Running.
+		if isContainerTerminated(pod, e, id) {
+			toEvict = append(toEvict, id)
+		}
+		return true
+	})
+	for _, id := range toEvict {
+		c.containerLocks.WithLock(id, func() {
+			c.entries.Delete(id)
+		})
+		// See deleteContainer comment on why we don't ReleaseLock here.
+		c.metricsManager.ReportContainerProfileReconcilerEviction("pod_stopped")
+	}
+
+	// NOTE: we intentionally do NOT GC pending entries based on pod state.
+	// A previous version dropped pending entries when GetPod returned nil or
+	// the container wasn't yet Running — but the k8s pod cache and container
+	// statuses lag the containerwatcher Add event by tens of seconds on busy
+	// nodes, so the GC dropped every pending entry before retries had a
+	// chance to succeed. Cleanup for terminated containers flows through
+	// deleteContainer (EventTypeRemoveContainer) which clears both entries
+	// and pending atomically. Memory growth from stuck-pending entries is
+	// bounded by the node's container churn.
+
+	c.metricsManager.SetContainerProfileCacheEntries("total", float64(c.entries.Len()))
+	c.metricsManager.SetContainerProfileCacheEntries("pending", float64(c.pending.Len()))
+}
+
+// isContainerTerminated reports whether the container identified by `id` or
+// by (e.ContainerName, e.PodUID) has a Terminated state in the pod's
+// container/initContainer/ephemeralContainer statuses. This is stricter than
+// "not Running": a container in Waiting state is NOT considered terminated.
+// Used by reconcileOnce as the eviction signal.
+func isContainerTerminated(pod *corev1.Pod, e *CachedContainerProfile, id string) bool {
+	statuses := make([]corev1.ContainerStatus, 0,
+		len(pod.Status.ContainerStatuses)+
+			len(pod.Status.InitContainerStatuses)+
+			len(pod.Status.EphemeralContainerStatuses))
+	statuses = append(statuses, pod.Status.ContainerStatuses...)
+	statuses = append(statuses, pod.Status.InitContainerStatuses...)
+	statuses = append(statuses, pod.Status.EphemeralContainerStatuses...)
+	for _, s := range statuses {
+		if s.ContainerID == "" {
+			if s.Name == e.ContainerName && string(pod.UID) == e.PodUID {
+				return s.State.Terminated != nil
+			}
+			continue
+		}
+		if utils.TrimRuntimePrefix(s.ContainerID) == id {
+			return s.State.Terminated != nil
+		}
+	}
+	// Container not found in any status list. If no statuses have been
+	// published yet (kubelet lag on a brand-new pod), do NOT evict — the
+	// empty list is indistinguishable from a fully-reaped container otherwise.
+	if len(statuses) == 0 {
+		return false
+	}
+	// Statuses were published but this container is absent: it was reaped.
+	return true
+}
+
+// isContainerRunning reports whether the container identified by `id` (the
+// cache key, a trimmed containerID) or by (e.ContainerName, e.PodUID) is in
+// State=Running in the pod's container/initContainer/ephemeralContainer
+// statuses.
+//
+// Pre-running init containers can appear with an empty ContainerID in the
+// status (kubelet hasn't published it yet). In that case we fall back to
+// matching on (Name, PodUID) so we don't prematurely evict the entry the
+// instant it's populated.
+func isContainerRunning(pod *corev1.Pod, e *CachedContainerProfile, id string) bool {
+	statuses := make([]corev1.ContainerStatus, 0,
+		len(pod.Status.ContainerStatuses)+
+			len(pod.Status.InitContainerStatuses)+
+			len(pod.Status.EphemeralContainerStatuses))
+	statuses = append(statuses, pod.Status.ContainerStatuses...)
+	statuses = append(statuses, pod.Status.InitContainerStatuses...)
+	statuses = append(statuses, pod.Status.EphemeralContainerStatuses...)
+	for _, s := range statuses {
+		if s.ContainerID == "" {
+			// pre-running init container: match by (Name, PodUID)
+			if s.Name == e.ContainerName && string(pod.UID) == e.PodUID {
+				return s.State.Running != nil
+			}
+			continue
+		}
+		if utils.TrimRuntimePrefix(s.ContainerID) == id {
+			return s.State.Running != nil
+		}
+	}
+	return false
+}
+
+// refreshAllEntries re-fetches CP + user AP/NN for each cache entry and
+// updates the projection if any ResourceVersion changed. Fast-skip when RV +
+// UserAPRV + UserNNRV all match (delta #4). Exposed for tests.
+func (c *ContainerProfileCacheImpl) refreshAllEntries(ctx context.Context) {
+	start := time.Now()
+	defer func() {
+		c.metricsManager.ReportContainerProfileReconcilerDuration("refresh", time.Since(start))
+	}()
+	// Snapshot first to avoid holding SafeMap's RLock while refreshOneEntry
+	// writes back via Set (which needs the write lock).
+	type snapshot struct {
+		id string
+		e  *CachedContainerProfile
+	}
+	var work []snapshot
+	c.entries.Range(func(id string, e *CachedContainerProfile) bool {
+		if ctx.Err() != nil { // delta #3
+			return false
+		}
+		work = append(work, snapshot{id: id, e: e})
+		return true
+	})
+	for _, w := range work {
+		if ctx.Err() != nil {
+			return
+		}
+		c.containerLocks.WithLock(w.id, func() {
+			c.refreshOneEntry(ctx, w.id, w.e)
+		})
+	}
+}
+
+// refreshOneEntry refreshes a single cache entry under the per-container lock.
+// Re-fetches ALL sources the entry was originally built from (consolidated CP,
+// workload-level AP/NN, user-managed AP/NN at "ug-", and any
+// label-referenced user AP/NN overlay) and rebuilds the projection if ANY
+// ResourceVersion changed. Keeping the existing entry on fetch errors is fine:
+// the next tick will retry.
+//
+// Rebuild on refresh applies the same projection ladder as tryPopulateEntry:
+//
+//	base CP → workload AP+NN → user-managed (ug-) AP+NN → user overlay AP+NN.
+//
+// We intentionally DO NOT re-apply the partial-on-non-PreRunning gate here:
+// any entry that survived addContainer already passed that gate (or was
+// PreRunning), so refresh can accept partial profiles freely. (Fix B for
+// Test_17 / Test_19: the workload AP/NN must be re-fetched each tick so a
+// "ready" -> "completed" transition propagates to ProfileState.Status, which
+// in turn promotes fail_on_profile from false to true.)
+func (c *ContainerProfileCacheImpl) refreshOneEntry(ctx context.Context, id string, e *CachedContainerProfile) {
+	// Resurrection guard (reviewer #1): refreshAllEntries snapshots entries
+	// without holding containerLocks, so a concurrent deleteContainer /
+	// reconcile-evict may have removed the entry between snapshot and lock
+	// acquisition.
If so, bail; otherwise the rebuild's c.entries.Set would + // resurrect a dead container. + if _, still := c.entries.Load(id); !still { + return + } + + ns := e.Namespace + + // Re-fetch all sources. CP fetch errors (including 404) are treated as + // "not available right now" — mirroring tryPopulateEntry's behavior. We + // leave cp=nil and rely on the RV-match fast-skip below to preserve the + // existing entry when nothing has changed. This is what lets refresh + // pick up workload-level AP/NN transitions ("ready" -> "completed") even + // while the storage-side consolidated CP remains unpublished. + var cp *v1beta1.ContainerProfile + var cpErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + cp, cpErr = c.storageClient.GetContainerProfile(rctx, ns, e.CPName) + return cpErr + }) + if cpErr != nil { + // If the previous entry was built off a real CP (non-empty RV), a + // CP fetch error on this tick is transient — keep the entry as-is. + // If the entry never had a CP (RV == "", pure workload/user-managed + // build), treat the error as 404 and let workload/user-managed + // re-fetches drive any refresh. + if e.RV != "" { + logger.L().Debug("refreshOneEntry: CP fetch failed; keeping cached entry", + helpers.String("containerID", id), + helpers.String("cpName", e.CPName), + helpers.Error(cpErr)) + return + } + logger.L().Debug("refreshOneEntry: CP fetch failed (no prior CP); treating as not-available", + helpers.String("containerID", id), + helpers.String("cpName", e.CPName), + helpers.Error(cpErr)) + cp = nil + } + var userManagedAP *v1beta1.ApplicationProfile + var userManagedNN *v1beta1.NetworkNeighborhood + if e.WorkloadName != "" { + ugAPName := helpersv1.UserApplicationProfilePrefix + e.WorkloadName + var userManagedAPErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userManagedAP, userManagedAPErr = c.storageClient.GetApplicationProfile(rctx, ns, ugAPName) + return userManagedAPErr + }) + if userManagedAPErr != nil && e.UserManagedAPRV != "" { + logger.L().Debug("refreshOneEntry: user-managed AP fetch failed; keeping cached entry", + helpers.String("containerID", id), + helpers.String("name", ugAPName), + helpers.Error(userManagedAPErr)) + return + } + if userManagedAPErr != nil { + userManagedAP = nil // k8s client returns non-nil zero-value on 404; treat as absent + } + ugNNName := helpersv1.UserNetworkNeighborhoodPrefix + e.WorkloadName + var userManagedNNErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userManagedNN, userManagedNNErr = c.storageClient.GetNetworkNeighborhood(rctx, ns, ugNNName) + return userManagedNNErr + }) + if userManagedNNErr != nil && e.UserManagedNNRV != "" { + logger.L().Debug("refreshOneEntry: user-managed NN fetch failed; keeping cached entry", + helpers.String("containerID", id), + helpers.String("name", ugNNName), + helpers.Error(userManagedNNErr)) + return + } + if userManagedNNErr != nil { + userManagedNN = nil + } + } + var userAP *v1beta1.ApplicationProfile + var userNN *v1beta1.NetworkNeighborhood + if e.UserAPRef != nil { + var userAPErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userAP, userAPErr = c.storageClient.GetApplicationProfile(rctx, e.UserAPRef.Namespace, e.UserAPRef.Name) + return userAPErr + }) + if userAPErr != nil && e.UserAPRV != "" { + logger.L().Debug("refreshOneEntry: user-defined AP fetch failed; keeping cached entry", + helpers.String("containerID", id), + helpers.String("name", e.UserAPRef.Name), + helpers.Error(userAPErr)) + return + } + 
if userAPErr != nil { + userAP = nil + } + } + if e.UserNNRef != nil { + var userNNErr error + _ = c.refreshRPC(ctx, func(rctx context.Context) error { + userNN, userNNErr = c.storageClient.GetNetworkNeighborhood(rctx, e.UserNNRef.Namespace, e.UserNNRef.Name) + return userNNErr + }) + if userNNErr != nil && e.UserNNRV != "" { + logger.L().Debug("refreshOneEntry: user-defined NN fetch failed; keeping cached entry", + helpers.String("containerID", id), + helpers.String("name", e.UserNNRef.Name), + helpers.Error(userNNErr)) + return + } + if userNNErr != nil { + userNN = nil + } + } + + // Fast-skip when nothing changed. We match "absent" (nil) with empty RV: + // this avoids spurious rebuilds when an optional source is still missing, + // as long as it was also missing at the last build. + if rvsMatchCP(cp, e.RV) && + rvsMatchAP(userManagedAP, e.UserManagedAPRV) && + rvsMatchNN(userManagedNN, e.UserManagedNNRV) && + rvsMatchAP(userAP, e.UserAPRV) && + rvsMatchNN(userNN, e.UserNNRV) { + return + } + + c.rebuildEntryFromSources(id, e, cp, userManagedAP, userManagedNN, userAP, userNN) +} + +// rvsMatchCP, rvsMatchAP, rvsMatchNN return true when either (a) the object is +// absent and the stored RV is empty, or (b) the object is present and its RV +// matches the stored RV. This lets fast-skip treat "still missing" as a match. +func rvsMatchCP(obj *v1beta1.ContainerProfile, rv string) bool { + if obj == nil { + return rv == "" + } + return obj.ResourceVersion == rv +} +func rvsMatchAP(obj *v1beta1.ApplicationProfile, rv string) bool { + if obj == nil { + return rv == "" + } + return obj.ResourceVersion == rv +} +func rvsMatchNN(obj *v1beta1.NetworkNeighborhood, rv string) bool { + if obj == nil { + return rv == "" + } + return obj.ResourceVersion == rv +} + +// rebuildEntryFromSources constructs a fresh CachedContainerProfile from the +// given sources and stores it under `id`. Applies the projection ladder from +// tryPopulateEntry: base CP (or synthesized) → user-managed (ug-) AP+NN → +// label-referenced user overlay AP+NN. +// +// Called by the reconciler when any input ResourceVersion has changed. +func (c *ContainerProfileCacheImpl) rebuildEntryFromSources( + id string, + prev *CachedContainerProfile, + cp *v1beta1.ContainerProfile, + userManagedAP *v1beta1.ApplicationProfile, + userManagedNN *v1beta1.NetworkNeighborhood, + userAP *v1beta1.ApplicationProfile, + userNN *v1beta1.NetworkNeighborhood, +) { + pod := c.k8sObjectCache.GetPod(prev.Namespace, prev.PodName) + + // Backfill PodUID when the entry was originally added before the pod + // appeared in the k8s cache. An empty PodUID on a pre-running init + // container (where the pod-status ContainerID is also empty) makes + // isContainerTerminated's (Name, PodUID) fallback match zero and treat + // the entry as terminated on the next eviction pass. Healing it here + // lets the next reconcileOnce correctly classify the container. + podUID := prev.PodUID + if podUID == "" && pod != nil { + podUID = string(pod.UID) + } + + // When the consolidated CP is absent but we still have user-managed / + // user-defined overlays to project, synthesize an empty base so + // downstream state display is sensible. 
+ effectiveCP := cp + if effectiveCP == nil { + syntheticName := prev.WorkloadName + if syntheticName == "" { + syntheticName = prev.CPName + } + effectiveCP = &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: syntheticName, + Namespace: prev.Namespace, + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Full, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + } + } + + projected := effectiveCP + // Ladder pass #1: user-managed "ug-" AP + NN. + if userManagedAP != nil || userManagedNN != nil { + p, warnings := projectUserProfiles(projected, userManagedAP, userManagedNN, pod, prev.ContainerName) + projected = p + c.emitOverlayMetrics(userManagedAP, userManagedNN, warnings) + } + // Ladder pass #2: label-referenced user overlay AP + NN. + shared := userAP == nil && userNN == nil && + userManagedAP == nil && userManagedNN == nil && + cp != nil + var userWarnings []partialProfileWarning + if userAP != nil || userNN != nil { + p, w := projectUserProfiles(projected, userAP, userNN, pod, prev.ContainerName) + projected = p + userWarnings = w + } + c.emitOverlayMetrics(userAP, userNN, userWarnings) + + // Rebuild the call-stack search tree from the projected profile. + tree := callstackcache.NewCallStackSearchTree() + for _, stack := range projected.Spec.IdentifiedCallStacks { + tree.AddCallStack(stack) + } + + newEntry := &CachedContainerProfile{ + Profile: projected, + State: &objectcache.ProfileState{Completion: effectiveCP.Annotations[helpersv1.CompletionMetadataKey], Status: effectiveCP.Annotations[helpersv1.StatusMetadataKey], Name: effectiveCP.Name}, + CallStackTree: tree, + ContainerName: prev.ContainerName, + PodName: prev.PodName, + Namespace: prev.Namespace, + PodUID: podUID, + WorkloadID: prev.WorkloadID, + CPName: prev.CPName, + WorkloadName: prev.WorkloadName, + Shared: shared, + RV: rvOfCP(cp), + UserManagedAPRV: rvOfAP(userManagedAP), + UserManagedNNRV: rvOfNN(userManagedNN), + UserAPRV: rvOfAP(userAP), + UserNNRV: rvOfNN(userNN), + } + if userAP != nil { + newEntry.UserAPRef = &namespacedName{Namespace: userAP.Namespace, Name: userAP.Name} + } else if prev.UserAPRef != nil { + // Preserve the ref so subsequent ticks still know to re-fetch the + // overlay (e.g. transient fetch error during this tick). + newEntry.UserAPRef = prev.UserAPRef + } + if userNN != nil { + newEntry.UserNNRef = &namespacedName{Namespace: userNN.Namespace, Name: userNN.Name} + } else if prev.UserNNRef != nil { + newEntry.UserNNRef = prev.UserNNRef + } + + c.entries.Set(id, newEntry) +} + +// rvOfCP / rvOfAP / rvOfNN return the object's ResourceVersion or "" when nil. +// Separate typed versions avoid the Go nil-interface trap where a typed-nil +// pointer wrapped in an interface is not == nil. +func rvOfCP(o *v1beta1.ContainerProfile) string { + if o == nil { + return "" + } + return o.ResourceVersion +} +func rvOfAP(o *v1beta1.ApplicationProfile) string { + if o == nil { + return "" + } + return o.ResourceVersion +} +func rvOfNN(o *v1beta1.NetworkNeighborhood) string { + if o == nil { + return "" + } + return o.ResourceVersion +} + +// retryPendingEntries re-issues GetContainerProfile for every containerID that +// was seen on ContainerCallback(Add) but whose CP was not yet in storage. On +// success the entry is promoted into the main cache and removed from pending. +// Exposed for tests. +// +// This preserves the legacy-cache behavior where the periodic "ListProfiles" +// tick recovered containers whose CP showed up after container-start. 
Without
+// this retry, a container whose CP is created asynchronously (the normal
+// path, since containerprofilemanager creates the CP after observing behavior)
+// would never enter the cache. See component-test regression analysis at
+// .omc/plans/containerprofile-cache-component-test-findings.md.
+func (c *ContainerProfileCacheImpl) retryPendingEntries(ctx context.Context) {
+	type snap struct {
+		id string
+		p  *pendingContainer
+	}
+	var work []snap
+	c.pending.Range(func(id string, p *pendingContainer) bool {
+		if ctx.Err() != nil {
+			return false
+		}
+		work = append(work, snap{id: id, p: p})
+		return true
+	})
+	for _, w := range work {
+		if ctx.Err() != nil {
+			return
+		}
+		c.containerLocks.WithLock(w.id, func() {
+			// Double-check pending still contains this id (could have been
+			// promoted or dropped by a concurrent path).
+			if _, still := c.pending.Load(w.id); !still {
+				return
+			}
+			c.tryPopulateEntry(ctx, w.id, w.p.container, w.p.sharedData, w.p.cpName, w.p.workloadName)
+		})
+	}
+}
diff --git a/pkg/objectcache/containerprofilecache/reconciler_test.go b/pkg/objectcache/containerprofilecache/reconciler_test.go
new file mode 100644
index 0000000000..0bdf92f180
--- /dev/null
+++ b/pkg/objectcache/containerprofilecache/reconciler_test.go
@@ -0,0 +1,1199 @@
+package containerprofilecache
+
+import (
+	"context"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers"
+	"github.com/kubescape/node-agent/pkg/config"
+	"github.com/kubescape/node-agent/pkg/metricsmanager"
+	"github.com/kubescape/node-agent/pkg/objectcache"
+	"github.com/kubescape/node-agent/pkg/storage"
+	"github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+// controllableK8sCache is a K8sObjectCache stub whose GetPod can be scripted
+// per (namespace, podName) and whose invocation count is observable for the
+// cancellation test. The remaining interface methods (not exercised by these
+// tests) are implemented as no-ops.
+type controllableK8sCache struct { + pods map[string]*corev1.Pod + podHook func(namespace, podName string) *corev1.Pod // optional override + calls atomic.Int64 +} + +var _ objectcache.K8sObjectCache = (*controllableK8sCache)(nil) + +func newControllableK8sCache() *controllableK8sCache { + return &controllableK8sCache{pods: map[string]*corev1.Pod{}} +} + +func (k *controllableK8sCache) setPod(namespace, podName string, pod *corev1.Pod) { + k.pods[namespace+"/"+podName] = pod +} + +func (k *controllableK8sCache) GetPod(namespace, podName string) *corev1.Pod { + k.calls.Add(1) + if k.podHook != nil { + return k.podHook(namespace, podName) + } + if p, ok := k.pods[namespace+"/"+podName]; ok { + return p + } + return nil +} +func (k *controllableK8sCache) GetPodSpec(_, _ string) *corev1.PodSpec { return nil } +func (k *controllableK8sCache) GetPodStatus(_, _ string) *corev1.PodStatus { return nil } +func (k *controllableK8sCache) GetApiServerIpAddress() string { return "" } +func (k *controllableK8sCache) GetPods() []*corev1.Pod { return nil } +func (k *controllableK8sCache) SetSharedContainerData(_ string, _ *objectcache.WatchedContainerData) { +} +func (k *controllableK8sCache) GetSharedContainerData(_ string) *objectcache.WatchedContainerData { + return nil +} +func (k *controllableK8sCache) DeleteSharedContainerData(_ string) {} + +// countingProfileClient tracks per-method RPC counts so tests can assert +// fast-skip behavior. +type countingProfileClient struct { + cp *v1beta1.ContainerProfile + ap *v1beta1.ApplicationProfile + nn *v1beta1.NetworkNeighborhood + + cpCalls atomic.Int64 + apCalls atomic.Int64 + nnCalls atomic.Int64 +} + +var _ storage.ProfileClient = (*countingProfileClient)(nil) + +func (f *countingProfileClient) GetContainerProfile(_ context.Context, _, _ string) (*v1beta1.ContainerProfile, error) { + f.cpCalls.Add(1) + return f.cp, nil +} +func (f *countingProfileClient) GetApplicationProfile(_ context.Context, _, _ string) (*v1beta1.ApplicationProfile, error) { + f.apCalls.Add(1) + return f.ap, nil +} +func (f *countingProfileClient) GetNetworkNeighborhood(_ context.Context, _, _ string) (*v1beta1.NetworkNeighborhood, error) { + f.nnCalls.Add(1) + return f.nn, nil +} +func (f *countingProfileClient) ListApplicationProfiles(_ context.Context, _ string, _ int64, _ string) (*v1beta1.ApplicationProfileList, error) { + return &v1beta1.ApplicationProfileList{}, nil +} +func (f *countingProfileClient) ListNetworkNeighborhoods(_ context.Context, _ string, _ int64, _ string) (*v1beta1.NetworkNeighborhoodList, error) { + return &v1beta1.NetworkNeighborhoodList{}, nil +} + +// countingMetrics tallies ReportContainerProfileLegacyLoad calls so the T8 +// end-to-end test can assert the overlay refresh re-emits the full-load signal. 
+type countingMetrics struct { + metricsmanager.MetricsMock + mu sync.Mutex + legacyLoads map[string]int // key = kind+"|"+completeness + evictions map[string]int + entriesByKnd map[string]float64 +} + +func newCountingMetrics() *countingMetrics { + return &countingMetrics{ + legacyLoads: map[string]int{}, + evictions: map[string]int{}, + entriesByKnd: map[string]float64{}, + } +} +func (m *countingMetrics) ReportContainerProfileLegacyLoad(kind, completeness string) { + m.mu.Lock() + defer m.mu.Unlock() + m.legacyLoads[kind+"|"+completeness]++ +} +func (m *countingMetrics) ReportContainerProfileReconcilerEviction(reason string) { + m.mu.Lock() + defer m.mu.Unlock() + m.evictions[reason]++ +} +func (m *countingMetrics) SetContainerProfileCacheEntries(kind string, count float64) { + m.mu.Lock() + defer m.mu.Unlock() + m.entriesByKnd[kind] = count +} +func (m *countingMetrics) legacyLoad(kind, completeness string) int { + m.mu.Lock() + defer m.mu.Unlock() + return m.legacyLoads[kind+"|"+completeness] +} +func (m *countingMetrics) eviction(reason string) int { + m.mu.Lock() + defer m.mu.Unlock() + return m.evictions[reason] +} + +// newReconcilerCache returns a cache wired with a controllable k8s cache and +// a counting profile client. Tests drive reconcileOnce / refreshAllEntries +// directly. +func newReconcilerCache(t *testing.T, client storage.ProfileClient, k8s objectcache.K8sObjectCache, metrics metricsmanager.MetricsManager) *ContainerProfileCacheImpl { + t.Helper() + cfg := config.Config{ProfilesCacheRefreshRate: 30 * time.Second} + return NewContainerProfileCache(cfg, client, k8s, metrics) +} + +// newEntry makes a CachedContainerProfile for tests without going through +// addContainer (which requires priming shared data + instance-id machinery). +func newEntry(cp *v1beta1.ContainerProfile, containerName, podName, namespace, podUID string) *CachedContainerProfile { + return &CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: containerName, + PodName: podName, + Namespace: namespace, + PodUID: podUID, + CPName: cp.Name, + RV: cp.ResourceVersion, + Shared: true, + } +} + +// TestReconcilerKeepsEntryWhenPodMissing — entry whose pod returns nil is +// retained (not evicted). The k8s pod cache routinely lags container events +// on busy nodes; evicting on "pod not found" churned every entry per tick. +// Cleanup for terminated containers flows through deleteContainer. +func TestReconcilerKeepsEntryWhenPodMissing(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}} + client := &countingProfileClient{cp: cp} + k8s := newControllableK8sCache() // GetPod returns nil for everything + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + + id := "c1" + c.entries.Set(id, newEntry(cp, "nginx", "nginx-abc", "default", "uid-1")) + + c.reconcileOnce(context.Background()) + + assert.NotNil(t, c.GetContainerProfile(id), "entry must be retained when pod is missing from cache") + assert.Equal(t, 0, metrics.eviction("pod_stopped"), "no eviction when pod is absent") +} + +// TestReconcilerEvictsTerminatedContainer — entry whose container has +// clearly transitioned to Terminated state IS evicted. 
+func TestReconcilerEvictsTerminatedContainer(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}} + client := &countingProfileClient{cp: cp} + k8s := newControllableK8sCache() + id := "terminated123" + k8s.setPod("default", "nginx-abc", &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "nginx-abc", Namespace: "default", UID: types.UID("uid-1")}, + Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{ + Name: "nginx", + ContainerID: "containerd://" + id, + State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 0}}, + }}}, + }) + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + c.entries.Set(id, newEntry(cp, "nginx", "nginx-abc", "default", "uid-1")) + + c.reconcileOnce(context.Background()) + + assert.Nil(t, c.GetContainerProfile(id), "terminated container entry must be evicted") + assert.Equal(t, 1, metrics.eviction("pod_stopped"), "should report one eviction") +} + +// TestReconcilerKeepsWaitingContainer — entry whose container is in Waiting +// state (e.g. newly-started or pre-running init container with empty ID) +// must NOT be evicted. +func TestReconcilerKeepsWaitingContainer(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}} + client := &countingProfileClient{cp: cp} + k8s := newControllableK8sCache() + id := "waitingabc" + k8s.setPod("default", "nginx-abc", &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "nginx-abc", Namespace: "default", UID: types.UID("uid-1")}, + Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{ + Name: "nginx", + ContainerID: "containerd://" + id, + State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "ContainerCreating"}}, + }}}, + }) + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + c.entries.Set(id, newEntry(cp, "nginx", "nginx-abc", "default", "uid-1")) + + c.reconcileOnce(context.Background()) + + assert.NotNil(t, c.GetContainerProfile(id), "waiting container entry must be retained") + assert.Equal(t, 0, metrics.eviction("pod_stopped"), "no eviction for Waiting state") +} + +// TestReconcilerKeepsRunningContainer — entry is kept when pod has a Running +// container status matching `id`. +func TestReconcilerKeepsRunningContainer(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}} + client := &countingProfileClient{cp: cp} + k8s := newControllableK8sCache() + id := "abc123" + k8s.setPod("default", "nginx-abc", &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "nginx-abc", Namespace: "default", UID: types.UID("uid-1")}, + Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{ + Name: "nginx", + ContainerID: "containerd://" + id, + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}}, + }) + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + c.entries.Set(id, newEntry(cp, "nginx", "nginx-abc", "default", "uid-1")) + + c.reconcileOnce(context.Background()) + + assert.NotNil(t, c.GetContainerProfile(id), "running container entry must remain") + assert.Equal(t, 0, metrics.eviction("pod_stopped"), "should not evict a running entry") +} + +// TestIsContainerRunning_PreRunningInitWithEmptyContainerID — T2c from the +// plan risks. 
Pre-running init container publishes an empty ContainerID, so +// we fall back to (Name, PodUID) matching. +func TestIsContainerRunning_PreRunningInitWithEmptyContainerID(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{UID: types.UID("pod-uid-123")}, + Status: corev1.PodStatus{InitContainerStatuses: []corev1.ContainerStatus{{ + Name: "init-1", + ContainerID: "", // not published yet + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}}, + } + entry := &CachedContainerProfile{ContainerName: "init-1", PodUID: "pod-uid-123"} + assert.True(t, isContainerRunning(pod, entry, "init-cid"), + "pre-running init container with empty ContainerID must match on (Name, PodUID)") +} + +// TestIsContainerRunning_ContainerIDMatchTakesPriority — the containerd:// etc +// prefix is stripped before comparing against the cache key. +func TestIsContainerRunning_ContainerIDMatchTakesPriority(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{UID: types.UID("pod-uid-123")}, + Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{ + Name: "nginx", + ContainerID: "docker://abc", + State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}, + }}}, + } + entry := &CachedContainerProfile{ContainerName: "nginx", PodUID: "pod-uid-123"} + assert.True(t, isContainerRunning(pod, entry, "abc"), "docker:// prefix should be stripped") + assert.False(t, isContainerRunning(pod, entry, "zzz"), "id mismatch should return false") +} + +// TestIsContainerRunning_NotRunning — container exists but is Terminated. +func TestIsContainerRunning_NotRunning(t *testing.T) { + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{UID: types.UID("pod-uid-123")}, + Status: corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{{ + Name: "nginx", + ContainerID: "containerd://abc", + State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{ExitCode: 0}}, + }}}, + } + entry := &CachedContainerProfile{ContainerName: "nginx", PodUID: "pod-uid-123"} + assert.False(t, isContainerRunning(pod, entry, "abc")) +} + +// TestReconcilerExitsOnCtxCancel — R2 from plan risks, delta #3. Cancelling +// ctx mid-Range stops iteration early. +func TestReconcilerExitsOnCtxCancel(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "1"}} + client := &countingProfileClient{cp: cp} + k8s := newControllableK8sCache() + ctx, cancel := context.WithCancel(context.Background()) + // Hook: cancel ctx on the 3rd GetPod call, return nil to drive the + // Range's continuation. After cancel(), ctx.Err() is set and subsequent + // Range iterations should short-circuit. + var visits atomic.Int64 + k8s.podHook = func(_, _ string) *corev1.Pod { + visits.Add(1) + if visits.Load() == 3 { + cancel() + } + return nil + } + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + + // Populate 100 entries. + for i := 0; i < 100; i++ { + id := "c-" + itoa(i) + c.entries.Set(id, newEntry(cp, "nginx", "pod-"+itoa(i), "default", "uid-"+itoa(i))) + } + + c.reconcileOnce(ctx) + + got := visits.Load() + assert.Less(t, got, int64(100), "ctx cancel should short-circuit the Range well before 100 iterations") + assert.GreaterOrEqual(t, got, int64(3), "should observe at least the iterations up to cancel") + // We do NOT assert a specific eviction count: entries visited before the + // cancel were appended to toEvict and DO get evicted. 
The invariant under + // test is only that iteration stopped early. +} + +// TestRefreshFastSkipWhenAllRVsMatch — delta #4. When CP RV and both overlay +// RVs match the cached values, refreshOneEntry returns without rebuilding. +func TestRefreshFastSkipWhenAllRVsMatch(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "100"}} + ap := &v1beta1.ApplicationProfile{ObjectMeta: metav1.ObjectMeta{Name: "override", Namespace: "default", ResourceVersion: "50"}} + nn := &v1beta1.NetworkNeighborhood{ObjectMeta: metav1.ObjectMeta{Name: "override", Namespace: "default", ResourceVersion: "60"}} + client := &countingProfileClient{cp: cp, ap: ap, nn: nn} + k8s := newControllableK8sCache() + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + + id := "c1" + entry := &CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "nginx", + PodName: "nginx-abc", + Namespace: "default", + PodUID: "uid-1", + CPName: "cp", + UserAPRef: &namespacedName{Namespace: "default", Name: "override"}, + UserNNRef: &namespacedName{Namespace: "default", Name: "override"}, + Shared: false, + RV: "100", + UserAPRV: "50", + UserNNRV: "60", + } + c.entries.Set(id, entry) + beforeProfilePtr := entry.Profile + + c.refreshAllEntries(context.Background()) + + // Fetched CP once + overlays once each to check RVs; then fast-skipped. + assert.Equal(t, int64(1), client.cpCalls.Load(), "CP should be fetched once") + assert.Equal(t, int64(1), client.apCalls.Load(), "AP should be fetched once for RV check") + assert.Equal(t, int64(1), client.nnCalls.Load(), "NN should be fetched once for RV check") + + stored, ok := c.entries.Load(id) + require.True(t, ok) + // Same pointer: the entry was NOT rebuilt. + assert.Same(t, entry, stored, "entry must not be replaced on fast-skip") + assert.Same(t, beforeProfilePtr, stored.Profile, "Profile pointer must not change on fast-skip") + // No legacy-load metric emitted on fast-skip. + assert.Equal(t, 0, metrics.legacyLoad(kindApplication, completenessFull)) + assert.Equal(t, 0, metrics.legacyLoad(kindNetwork, completenessFull)) +} + +// TestRefreshRebuildsOnUserAPChange — entry has stale UserAPRV; refresh sees +// a newer AP RV and rebuilds. 
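+//
+// The rebuild trigger is the RV comparison sketched here (assumed shape; the
+// real check lives in refreshOneEntry):
+//
+//	if entry.RV == cp.ResourceVersion &&
+//		entry.UserAPRV == ap.ResourceVersion &&
+//		entry.UserNNRV == nn.ResourceVersion {
+//		return // fast-skip, covered by the previous test
+//	}
+//	// any RV mismatch: re-project CP + overlays and replace the entry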
+func TestRefreshRebuildsOnUserAPChange(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "100"}, + Spec: v1beta1.ContainerProfileSpec{Capabilities: []string{"SYS_PTRACE"}}, + } + ap := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "override", Namespace: "default", ResourceVersion: "51"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Capabilities: []string{"NET_BIND_SERVICE"}, + }}, + }, + } + client := &countingProfileClient{cp: cp, ap: ap} + k8s := newControllableK8sCache() + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + + id := "c1" + entry := &CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "nginx", + PodName: "nginx-abc", + Namespace: "default", + PodUID: "uid-1", + CPName: "cp", + UserAPRef: &namespacedName{Namespace: "default", Name: "override"}, + Shared: false, + RV: "100", + UserAPRV: "50", // stale: storage now returns 51 + } + c.entries.Set(id, entry) + + c.refreshAllEntries(context.Background()) + + stored, ok := c.entries.Load(id) + require.True(t, ok) + assert.NotSame(t, entry, stored, "entry must be replaced when user-AP RV changes") + assert.Equal(t, "51", stored.UserAPRV, "new UserAPRV must be recorded") + assert.ElementsMatch(t, []string{"SYS_PTRACE", "NET_BIND_SERVICE"}, stored.Profile.Spec.Capabilities, + "rebuilt projection must include merged overlay capabilities") +} + +// TestRefreshRebuildsOnCPChange — CP RV changed; entry rebuilds with fresh CP. +func TestRefreshRebuildsOnCPChange(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "101"}, + Spec: v1beta1.ContainerProfileSpec{Capabilities: []string{"SYS_ADMIN"}}, + } + client := &countingProfileClient{cp: cp} + k8s := newControllableK8sCache() + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + + oldCP := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "100"}, + } + id := "c1" + entry := newEntry(oldCP, "nginx", "nginx-abc", "default", "uid-1") + c.entries.Set(id, entry) + + c.refreshAllEntries(context.Background()) + + stored, ok := c.entries.Load(id) + require.True(t, ok) + assert.Equal(t, "101", stored.RV, "RV must update to the fresh CP's version") + assert.Same(t, cp, stored.Profile, "shared fast-path: fresh CP pointer stored directly") +} + +// TestT8_EndToEndRefreshUpdatesProjection — delta #5. Mutate the user-AP in +// the stubbed storage so its RV + execs change; assert the cached projection +// reflects the new execs AND that the legacy-load metric was re-emitted. 
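+//
+// Expected projection before and after the storage mutation (sketching the
+// assertions below):
+//
+//	user-AP in storage    cached execs (projection)
+//	RV "50" (seeded)      /bin/base, /bin/old
+//	RV "51" (mutated)     /bin/base, /bin/new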
+func TestT8_EndToEndRefreshUpdatesProjection(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "100"}, + Spec: v1beta1.ContainerProfileSpec{ + Execs: []v1beta1.ExecCalls{{Path: "/bin/base", Args: []string{"a"}}}, + }, + } + ap := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "override", Namespace: "default", ResourceVersion: "50"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Execs: []v1beta1.ExecCalls{{Path: "/bin/old", Args: []string{"x"}}}, + }}, + }, + } + client := &countingProfileClient{cp: cp, ap: ap} + k8s := newControllableK8sCache() + metrics := newCountingMetrics() + c := newReconcilerCache(t, client, k8s, metrics) + + // Initial entry built from base CP + overlay: use addContainer's private + // buildEntry logic via projectUserProfiles directly, then seed. + initialProjected, _ := projectUserProfiles(cp, ap, nil, nil, "nginx") + id := "c1" + entry := &CachedContainerProfile{ + Profile: initialProjected, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "nginx", + PodName: "nginx-abc", + Namespace: "default", + PodUID: "uid-1", + CPName: "cp", + UserAPRef: &namespacedName{Namespace: "default", Name: "override"}, + Shared: false, + RV: "100", + UserAPRV: "50", + } + c.entries.Set(id, entry) + + // Mutate storage: new AP RV + new execs. + client.ap = &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "override", Namespace: "default", ResourceVersion: "51"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Execs: []v1beta1.ExecCalls{{Path: "/bin/new", Args: []string{"y"}}}, + }}, + }, + } + + c.refreshAllEntries(context.Background()) + + stored, ok := c.entries.Load(id) + require.True(t, ok) + assert.Equal(t, "51", stored.UserAPRV, "refresh must record the new user-AP RV") + + // The projection must include the new exec (merged on top of the base CP's exec). + var paths []string + for _, e := range stored.Profile.Spec.Execs { + paths = append(paths, e.Path) + } + assert.Contains(t, paths, "/bin/base", "base CP exec must be preserved") + assert.Contains(t, paths, "/bin/new", "new user-AP exec must be projected into the cache") + assert.NotContains(t, paths, "/bin/old", "stale user-AP exec must NOT be in the projection") + + assert.GreaterOrEqual(t, metrics.legacyLoad(kindApplication, completenessFull), 1, + "refresh with user-AP overlay must emit full-load metric") +} + +// TestRefreshNoEntryWhenCPGetFails — storage error on CP keeps the existing +// entry unchanged (no deletion). +func TestRefreshNoEntryWhenCPGetFails(t *testing.T) { + cp := &v1beta1.ContainerProfile{ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "100"}} + failing := &failingProfileClient{cpErr: assertErr{}} + k8s := newControllableK8sCache() + metrics := newCountingMetrics() + c := newReconcilerCache(t, failing, k8s, metrics) + + id := "c1" + entry := newEntry(cp, "nginx", "nginx-abc", "default", "uid-1") + c.entries.Set(id, entry) + + c.refreshAllEntries(context.Background()) + + stored, ok := c.entries.Load(id) + require.True(t, ok, "CP fetch error must not delete the entry") + assert.Same(t, entry, stored, "entry pointer must not change when CP fetch fails") +} + +// TestRefreshPreservesEntryOnTransientOverlayError — overlay fetch errors must +// not strip overlay data from the cache. 
If a user-managed or user-defined +// AP/NN GET returns an error while the entry already has a non-empty cached RV +// for that overlay, refreshOneEntry must keep the old entry unchanged (same +// pointer) rather than rebuilding without the overlay and clearing its RV. +// Regression test for the refreshRPC timeout → silent nil → spurious rebuild path. +func TestRefreshPreservesEntryOnTransientOverlayError(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp", Namespace: "default", ResourceVersion: "100"}, + Spec: v1beta1.ContainerProfileSpec{Capabilities: []string{"SYS_PTRACE"}}, + } + + type overlayFields struct { + workloadName string + userManagedAPRV string + userManagedNNRV string + userAPRef *namespacedName + userAPRV string + userNNRef *namespacedName + userNNRV string + } + tests := []struct { + name string + apErr bool + nnErr bool + overlay overlayFields + }{ + { + name: "user-managed AP timeout preserves entry", + apErr: true, + overlay: overlayFields{ + workloadName: "nginx", + userManagedAPRV: "9", + }, + }, + { + name: "user-managed NN timeout preserves entry", + nnErr: true, + overlay: overlayFields{ + workloadName: "nginx", + userManagedNNRV: "7", + }, + }, + { + name: "user-defined AP timeout preserves entry", + apErr: true, + overlay: overlayFields{ + userAPRef: &namespacedName{Namespace: "default", Name: "override"}, + userAPRV: "50", + }, + }, + { + name: "user-defined NN timeout preserves entry", + nnErr: true, + overlay: overlayFields{ + userNNRef: &namespacedName{Namespace: "default", Name: "override"}, + userNNRV: "60", + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + apErr := error(nil) + if tc.apErr { + apErr = assertErr{} + } + nnErr := error(nil) + if tc.nnErr { + nnErr = assertErr{} + } + client := &overlayErrorClient{cp: cp, apErr: apErr, nnErr: nnErr} + k8s := newControllableK8sCache() + c := newReconcilerCache(t, client, k8s, nil) + + id := "c1" + entry := &CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "nginx", + PodName: "nginx-abc", + Namespace: "default", + PodUID: "uid-1", + CPName: "cp", + RV: "100", + WorkloadName: tc.overlay.workloadName, + UserManagedAPRV: tc.overlay.userManagedAPRV, + UserManagedNNRV: tc.overlay.userManagedNNRV, + UserAPRef: tc.overlay.userAPRef, + UserAPRV: tc.overlay.userAPRV, + UserNNRef: tc.overlay.userNNRef, + UserNNRV: tc.overlay.userNNRV, + Shared: false, + } + c.entries.Set(id, entry) + + c.refreshAllEntries(context.Background()) + + stored, ok := c.entries.Load(id) + require.True(t, ok, "overlay error must not delete the entry") + assert.Same(t, entry, stored, "entry pointer must not change when overlay fetch fails transiently") + // Overlay RVs must be unchanged (not cleared to ""). + assert.Equal(t, tc.overlay.userManagedAPRV, stored.UserManagedAPRV) + assert.Equal(t, tc.overlay.userManagedNNRV, stored.UserManagedNNRV) + assert.Equal(t, tc.overlay.userAPRV, stored.UserAPRV) + assert.Equal(t, tc.overlay.userNNRV, stored.UserNNRV) + }) + } +} + +// overlayErrorClient returns a valid CP but fails AP/NN calls with the +// configured errors. Used to test overlay error-preservation logic. 
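+//
+// The preservation rule it exercises, as a sketch (assumed shape; the real
+// logic lives in refreshOneEntry):
+//
+//	ap, err := client.GetApplicationProfile(ctx, ref.Namespace, ref.Name)
+//	if err != nil && entry.UserAPRV != "" {
+//		return // transient overlay failure: keep the old entry untouched
+//	}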
+type overlayErrorClient struct {
+	cp    *v1beta1.ContainerProfile
+	apErr error
+	nnErr error
+}
+
+var _ storage.ProfileClient = (*overlayErrorClient)(nil)
+
+func (o *overlayErrorClient) GetContainerProfile(_ context.Context, _, _ string) (*v1beta1.ContainerProfile, error) {
+	return o.cp, nil
+}
+func (o *overlayErrorClient) GetApplicationProfile(_ context.Context, _, _ string) (*v1beta1.ApplicationProfile, error) {
+	return nil, o.apErr
+}
+func (o *overlayErrorClient) GetNetworkNeighborhood(_ context.Context, _, _ string) (*v1beta1.NetworkNeighborhood, error) {
+	return nil, o.nnErr
+}
+func (o *overlayErrorClient) ListApplicationProfiles(_ context.Context, _ string, _ int64, _ string) (*v1beta1.ApplicationProfileList, error) {
+	return &v1beta1.ApplicationProfileList{}, nil
+}
+func (o *overlayErrorClient) ListNetworkNeighborhoods(_ context.Context, _ string, _ int64, _ string) (*v1beta1.NetworkNeighborhoodList, error) {
+	return &v1beta1.NetworkNeighborhoodList{}, nil
+}
+
+// --- helpers ---
+
+// itoa is a local int-to-string so tests don't pull in strconv just for one
+// call site.
+func itoa(i int) string {
+	if i == 0 {
+		return "0"
+	}
+	neg := i < 0
+	if neg {
+		i = -i
+	}
+	buf := [20]byte{}
+	pos := len(buf)
+	for i > 0 {
+		pos--
+		buf[pos] = byte('0' + i%10)
+		i /= 10
+	}
+	if neg {
+		pos--
+		buf[pos] = '-'
+	}
+	return string(buf[pos:])
+}
+
+// assertErr is a trivial error sentinel used in a few negative tests.
+type assertErr struct{}
+
+func (assertErr) Error() string { return "synthetic error" }
+
+// failingProfileClient always returns cpErr from GetContainerProfile.
+type failingProfileClient struct {
+	cpErr error
+}
+
+var _ storage.ProfileClient = (*failingProfileClient)(nil)
+
+func (f *failingProfileClient) GetContainerProfile(_ context.Context, _, _ string) (*v1beta1.ContainerProfile, error) {
+	return nil, f.cpErr
+}
+func (f *failingProfileClient) GetApplicationProfile(_ context.Context, _, _ string) (*v1beta1.ApplicationProfile, error) {
+	return nil, nil
+}
+func (f *failingProfileClient) GetNetworkNeighborhood(_ context.Context, _, _ string) (*v1beta1.NetworkNeighborhood, error) {
+	return nil, nil
+}
+func (f *failingProfileClient) ListApplicationProfiles(_ context.Context, _ string, _ int64, _ string) (*v1beta1.ApplicationProfileList, error) {
+	return &v1beta1.ApplicationProfileList{}, nil
+}
+func (f *failingProfileClient) ListNetworkNeighborhoods(_ context.Context, _ string, _ int64, _ string) (*v1beta1.NetworkNeighborhoodList, error) {
+	return &v1beta1.NetworkNeighborhoodList{}, nil
+}
+
+// Keep helpersv1 referenced at package level so the import (and this file)
+// still compiles even if the tests that dereference its constants are
+// trimmed or build-tagged away.
+var _ = helpersv1.CompletionMetadataKey
+
+// TestRefreshHonorsContextCancellationMidRPC verifies that a context
+// cancellation while refreshOneEntry is blocked in GetContainerProfile
+// causes the refresh to return within the rpcBudget, not hang for the
+// full reconciler timeout.
+func TestRefreshHonorsContextCancellationMidRPC(t *testing.T) {
+	// Buffered so the signal is stored even if the test's <-blocked read is
+	// slightly delayed — prevents a lossy non-blocking send from dropping it.
+ blocked := make(chan struct{}, 1) + unblock := make(chan struct{}) + blocking := &blockingProfileClient{ + blocked: blocked, + unblock: unblock, + } + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp-1", Namespace: "default", ResourceVersion: "42"}, + } + // Seed an existing entry so refreshOneEntry attempts a CP re-fetch. + k8s := newControllableK8sCache() + cfg := config.Config{ + ProfilesCacheRefreshRate: 30 * time.Second, + StorageRPCBudget: 100 * time.Millisecond, + } + cache := NewContainerProfileCache(cfg, blocking, k8s, nil) + cache.SeedEntryForTest("id1", &CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "c1", + PodName: "pod1", + Namespace: "default", + PodUID: "uid1", + CPName: "cp-1", + RV: "old-rv", // differs from cp.RV so fast-skip is skipped + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + done := make(chan struct{}) + go func() { + defer close(done) + cache.refreshAllEntries(ctx) + }() + + // Wait for the RPC to block, then cancel the context. + <-blocked + cancel() + + // The refresh must return within 2s of cancellation (well above the + // 100ms rpcBudget; the generous budget accommodates loaded CI runners). + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("refreshAllEntries did not return after context cancellation") + } + close(unblock) +} + +// blockingProfileClient blocks GetContainerProfile until unblocked. +type blockingProfileClient struct { + blocked chan struct{} + unblock chan struct{} +} + +var _ storage.ProfileClient = (*blockingProfileClient)(nil) + +func (b *blockingProfileClient) GetContainerProfile(ctx context.Context, _, _ string) (*v1beta1.ContainerProfile, error) { + b.blocked <- struct{}{} // buffered(1): stored if reader hasn't arrived yet + select { + case <-b.unblock: + return nil, nil + case <-ctx.Done(): + return nil, ctx.Err() + } +} +func (b *blockingProfileClient) GetApplicationProfile(_ context.Context, _, _ string) (*v1beta1.ApplicationProfile, error) { + return nil, nil +} +func (b *blockingProfileClient) GetNetworkNeighborhood(_ context.Context, _, _ string) (*v1beta1.NetworkNeighborhood, error) { + return nil, nil +} +func (b *blockingProfileClient) ListApplicationProfiles(_ context.Context, _ string, _ int64, _ string) (*v1beta1.ApplicationProfileList, error) { + return &v1beta1.ApplicationProfileList{}, nil +} +func (b *blockingProfileClient) ListNetworkNeighborhoods(_ context.Context, _ string, _ int64, _ string) (*v1beta1.NetworkNeighborhoodList, error) { + return &v1beta1.NetworkNeighborhoodList{}, nil +} + +// TestRetryPendingEntries_CPCreatedAfterAdd exercises the bug that slipped +// through PR #788 component tests: at EventTypeAddContainer the CP may not +// yet be in storage (it is created asynchronously by containerprofilemanager +// after observing the container). The new cache must retry per reconciler +// tick; otherwise the container is permanently absent from the cache and +// rule evaluation short-circuits as "no profile". +func TestRetryPendingEntries_CPCreatedAfterAdd(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-pending", + Namespace: "default", + ResourceVersion: "1", + }, + } + + // Start with storage returning 404 for the initial GET. 
+	client := &fakeProfileClient{cp: nil, cpErr: assertErrNotFound("cp-pending")}
+	c, k8s := newTestCache(t, client)
+
+	id := "container-pending"
+	primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx")
+
+	// addContainer: sees 404 -> pending bookkeeping, not an entry.
+	require.NoError(t, c.addContainer(eventContainer(id), context.Background()))
+	assert.Nil(t, c.GetContainerProfile(id), "no entry before CP exists in storage")
+	assert.Equal(t, 1, c.pending.Len(), "container recorded as pending")
+
+	// Storage creates the CP asynchronously (60s after start in real runs).
+	client.cp = cp
+	client.cpErr = nil
+
+	// Simulate one reconciler tick. retryPendingEntries iterates pending and
+	// promotes on successful GET.
+	c.retryPendingEntries(context.Background())
+
+	assert.NotNil(t, c.GetContainerProfile(id), "entry promoted after CP appears")
+	assert.Equal(t, 0, c.pending.Len(), "pending drained on successful promotion")
+	// Exactly two GETs: one from addContainer (404), one from retry (200).
+	assert.Equal(t, 2, client.getCPCalls, "retry should only re-GET once per tick")
+}
+
+// TestPendingEntriesAreNotGCedBeforeRetry verifies we no longer drop pending
+// entries from reconcileOnce. The component-tests regression (CI run
+// 24781030436 on ce329196) showed the k8s pod cache and container statuses
+// lag the containerwatcher Add event by tens of seconds on busy nodes, so a
+// pod-state-driven GC dropped every pending entry before retries had a
+// chance to succeed. Cleanup now flows exclusively through deleteContainer.
+func TestPendingEntriesAreNotGCedBeforeRetry(t *testing.T) {
+	client := &fakeProfileClient{cp: nil, cpErr: assertErrNotFound("cp-missing")}
+	c, k8s := newTestCache(t, client)
+
+	id := "container-pending"
+	primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx")
+	require.NoError(t, c.addContainer(eventContainer(id), context.Background()))
+	require.Equal(t, 1, c.pending.Len())
+
+	// Several reconciler passes with nil-returning GetPod must leave the
+	// pending entry in place so retry has a chance to succeed once profile
+	// data shows up in storage.
+	for range 3 {
+		c.reconcileOnce(context.Background())
+	}
+	assert.Equal(t, 1, c.pending.Len(), "pending entry retained across reconcile ticks")
+
+	// Only deleteContainer clears pending.
+	c.deleteContainer(id)
+	assert.Equal(t, 0, c.pending.Len(), "deleteContainer clears pending")
+}
+
+// assertErrNotFound is a minimal non-nil error for GET failures in tests.
+// Using a sentinel keeps the test readable without pulling in apierrors.
+func assertErrNotFound(name string) error {
+	return &testNotFoundErr{name: name}
+}
+
+type testNotFoundErr struct{ name string }
+
+func (e *testNotFoundErr) Error() string { return "container profile " + e.name + ": not found" }
+
+// TestPartialCP_NonPreRunning_StaysPending verifies that a CP marked partial
+// is NOT cached when the container is not PreRunning (i.e. started after the
+// agent was up). Legacy caches explicitly deleted partials on restart; we
+// mirror that by staying pending until the CP becomes Full.
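+//
+// The acceptance rule, as a sketch (the annotation keys are real helpersv1
+// constants; the helper shape is illustrative):
+//
+//	func acceptProfile(cp *v1beta1.ContainerProfile, preRunning bool) bool {
+//		if cp.Annotations[helpersv1.CompletionMetadataKey] == helpersv1.Partial {
+//			return preRunning // partial only tolerated across agent restarts
+//		}
+//		return true
+//	}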
+func TestPartialCP_NonPreRunning_StaysPending(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-partial", + Namespace: "default", + ResourceVersion: "1", + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Partial, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + } + client := &fakeProfileClient{cp: cp} + c, k8s := newTestCache(t, client) + + id := "container-partial-restart" + primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + // sharedData.PreRunningContainer is false by default → this simulates a + // fresh container start observed by a running agent. + + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + assert.Nil(t, c.GetContainerProfile(id), "partial CP must not populate cache on fresh container") + assert.Equal(t, 1, c.pending.Len(), "partial-on-restart stays pending") + + // Simulate the CP becoming Full (new agent-side aggregation round). + cp.Annotations[helpersv1.CompletionMetadataKey] = helpersv1.Full + cp.ResourceVersion = "2" + c.retryPendingEntries(context.Background()) + + assert.NotNil(t, c.GetContainerProfile(id), "Full CP promotes pending entry") + assert.Equal(t, 0, c.pending.Len(), "pending drained on Full") +} + +// TestPartialCP_PreRunning_Accepted verifies the inverse: when the agent +// restarts (all containers become PreRunning), we accept even a partial CP so +// rule evaluation can still alert on out-of-profile behavior (Test_19 +// semantics). +func TestPartialCP_PreRunning_Accepted(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-partial-prerunning", + Namespace: "default", + ResourceVersion: "1", + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Partial, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + } + client := &fakeProfileClient{cp: cp} + c, k8s := newTestCache(t, client) + + id := "container-partial-prerunning" + // Mark PreRunning so the partial is accepted. + primePreRunningSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + assert.NotNil(t, c.GetContainerProfile(id), "partial CP accepted for PreRunning container") + assert.Equal(t, 0, c.pending.Len(), "not pending when accepted") +} + +// TestOverlayLabel_TransientFetchFailure_RefsRetained verifies that when +// UserDefinedProfileMetadataKey is set but the user-AP/NN fetch fails, the +// entry still records UserAPRef / UserNNRef so the refresh loop can re-fetch +// on subsequent ticks instead of permanently dropping the overlay. +func TestOverlayLabel_TransientFetchFailure_RefsRetained(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp-with-overlay", Namespace: "default", ResourceVersion: "1"}, + } + // Overlay fetch returns an error; the base CP is fine. + client := &fakeProfileClient{cp: cp, apErr: assertErrNotFound("override"), nnErr: assertErrNotFound("override")} + c, k8s := newTestCache(t, client) + + id := "container-transient-overlay" + primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + + // Build the container with the overlay label set. 
+ ct := eventContainer(id) + ct.K8s.PodLabels = map[string]string{helpersv1.UserDefinedProfileMetadataKey: "override"} + + require.NoError(t, c.addContainer(ct, context.Background())) + + entry, ok := c.entries.Load(id) + require.True(t, ok, "entry stored with base CP even if overlay fetch failed") + require.NotNil(t, entry.UserAPRef, "UserAPRef retained for refresh retry") + require.NotNil(t, entry.UserNNRef, "UserNNRef retained for refresh retry") + assert.Equal(t, "override", entry.UserAPRef.Name) + assert.Equal(t, "override", entry.UserNNRef.Name) +} + +// TestRefreshDoesNotResurrectDeletedEntry verifies the Phase-4 reviewer race: +// refreshAllEntries snapshots entries without a lock; if deleteContainer +// removes the entry before refreshOneEntry takes the lock, the refresh must +// NOT re-insert it. +func TestRefreshDoesNotResurrectDeletedEntry(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "cp-resurrect", Namespace: "default", ResourceVersion: "1"}, + } + client := &fakeProfileClient{cp: cp} + c, k8s := newTestCache(t, client) + + id := "container-resurrect" + primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + require.NotNil(t, c.GetContainerProfile(id)) + + // Simulate the race: snapshot the entry, delete, then call refreshOneEntry. + entry, ok := c.entries.Load(id) + require.True(t, ok) + c.deleteContainer(id) + require.Nil(t, c.GetContainerProfile(id), "entry gone after delete") + + // Refresh for the deleted id must bail instead of resurrecting. + c.containerLocks.WithLock(id, func() { + c.refreshOneEntry(context.Background(), id, entry) + }) + + assert.Nil(t, c.GetContainerProfile(id), "refresh must not resurrect deleted entry") +} + +// TestUserDefinedProfileOnly_NoBaseCP verifies that a container with only a +// user-defined AP/NN (no base CP yet) still gets a cache entry, mirroring the +// legacy behavior where user-defined profiles were stored directly. +func TestUserDefinedProfileOnly_NoBaseCP(t *testing.T) { + userAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{Name: "user-override", Namespace: "default", ResourceVersion: "10"}, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{ + {Name: "nginx", Capabilities: []string{"CAP_NET_ADMIN"}}, + }, + }, + } + // Base CP fetch fails (404); only the overlay exists. + client := &fakeProfileClient{cp: nil, cpErr: assertErrNotFound("no-base"), ap: userAP} + c, k8s := newTestCache(t, client) + + id := "container-user-only" + primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + ct := eventContainer(id) + ct.K8s.PodLabels = map[string]string{helpersv1.UserDefinedProfileMetadataKey: "user-override"} + + require.NoError(t, c.addContainer(ct, context.Background())) + + cached := c.GetContainerProfile(id) + require.NotNil(t, cached, "entry populated from user-AP even without base CP") + // The synthesized CP + projection should carry the user AP's capabilities. + assert.Contains(t, cached.Spec.Capabilities, "CAP_NET_ADMIN") +} + +// primePreRunningSharedData is a variant of primeSharedData that sets the +// PreRunningContainer flag. 
+func primePreRunningSharedData(t *testing.T, k8s *objectcache.K8sObjectCacheMock, containerID, wlid string) { + t.Helper() + primeSharedData(t, k8s, containerID, wlid) + existing := k8s.GetSharedContainerData(containerID) + require.NotNil(t, existing) + existing.PreRunningContainer = true + k8s.SetSharedContainerData(containerID, existing) +} + +// TestRefreshUpdatesCPStatus exercises the refresh path: at addContainer +// time the consolidated CP may still be in Status="ready"; the cache must +// re-fetch it on each tick so a later "ready" -> "completed" transition +// propagates to the cached ProfileState, which in turn flips fail_on_profile +// from false to true (Test_17 / Test_19 semantics). +func TestRefreshUpdatesCPStatus(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-ready", + Namespace: "default", + ResourceVersion: "1", + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Full, + helpersv1.StatusMetadataKey: helpersv1.Learning, // "ready" + }, + }, + } + client := &fakeProfileClient{cp: cp} + c, k8s := newTestCache(t, client) + + id := "container-cp-ready" + primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + + entry, ok := c.entries.Load(id) + require.True(t, ok, "entry populated from CP") + require.NotNil(t, entry.State) + assert.Equal(t, helpersv1.Learning, entry.State.Status, + "Status reflects the CP at add time (ready / learning)") + + // Storage transitions CP to Status=completed. + client.cp = &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-ready", + Namespace: "default", + ResourceVersion: "2", + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Full, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + } + + c.refreshAllEntries(context.Background()) + + stored, ok := c.entries.Load(id) + require.True(t, ok) + require.NotNil(t, stored.State) + assert.Equal(t, helpersv1.Completed, stored.State.Status, + "refresh propagates CP Status=completed into ProfileState") + assert.Equal(t, "2", stored.RV, "refresh records the new CP RV") +} + +// TestUserManagedProfileMerged exercises the user-managed merge path +// (Test_12_MergingProfilesTest / Test_13_MergingNetworkNeighborhoodTest): +// a user-managed AP published at "ug-" is merged on top of +// the base CP. Anomalies NOT in the union of base + user-managed should +// produce alerts; anomalies present in either source should not. +func TestUserManagedProfileMerged(t *testing.T) { + // Base CP has exec "/bin/X"; user-managed AP adds "/bin/Y". 
+ cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-base", + Namespace: "default", + ResourceVersion: "1", + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Full, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + Spec: v1beta1.ContainerProfileSpec{ + Execs: []v1beta1.ExecCalls{{Path: "/bin/X"}}, + }, + } + userManagedAP := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ug-nginx", + Namespace: "default", + ResourceVersion: "9", + Annotations: map[string]string{ + helpersv1.CompletionMetadataKey: helpersv1.Full, + helpersv1.StatusMetadataKey: helpersv1.Completed, + }, + }, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Execs: []v1beta1.ExecCalls{{Path: "/bin/Y"}}, + }}, + }, + } + client := &fakeProfileClient{ + cp: cp, + userManagedAP: userManagedAP, + } + c, k8s := newTestCache(t, client) + + id := "container-user-managed" + primeSharedData(t, k8s, id, "wlid://cluster-a/namespace-default/deployment-nginx") + require.NoError(t, c.addContainer(eventContainer(id), context.Background())) + + cached := c.GetContainerProfile(id) + require.NotNil(t, cached, "entry populated") + var paths []string + for _, e := range cached.Spec.Execs { + paths = append(paths, e.Path) + } + assert.Contains(t, paths, "/bin/X", "base workload AP exec must be present") + assert.Contains(t, paths, "/bin/Y", "user-managed (ug-) AP exec must be merged in") + + // Verify the RV was captured so a later user-managed update would trigger + // a refresh rebuild. + entry, ok := c.entries.Load(id) + require.True(t, ok) + assert.Equal(t, "9", entry.UserManagedAPRV, "UserManagedAPRV recorded at add time") +} diff --git a/pkg/objectcache/containerprofilecache/shared_pointer_race_test.go b/pkg/objectcache/containerprofilecache/shared_pointer_race_test.go new file mode 100644 index 0000000000..5fe4dffa60 --- /dev/null +++ b/pkg/objectcache/containerprofilecache/shared_pointer_race_test.go @@ -0,0 +1,210 @@ +package containerprofilecache_test + +// TestSharedPointerReadersDoNotCorruptCache — PR 3 Part A. +// +// Validates that concurrent readers and a concurrent reconciler-refresh do not +// produce data races on the shared *v1beta1.ContainerProfile pointer returned +// by GetContainerProfile. +// +// Design: +// - Seed a cache entry backed by cpV1 (RV="1"). Storage serves cpV2 (RV="2") +// so every RefreshAllEntriesForTest call triggers a rebuild (atomic pointer +// swap on the entries map, no in-place mutation of the old slice). +// - 50 reader goroutines call GetContainerProfile in a tight loop and iterate +// the returned Spec.Execs, Spec.Opens, Spec.Capabilities slices READ-ONLY. +// - 1 writer goroutine alternates: RefreshAllEntriesForTest (triggers rebuild) +// then SeedEntryForTest (resets RV to "1" so the next refresh rebuilds again). +// - Run for 500ms under -race. The race detector will surface any unprotected +// concurrent read/write pair. If none fires, the shared-pointer fast-path is +// demonstrably safe for read-only consumers. +// +// NOTE: deliberately-mutating consumer (anti-pattern) is NOT tested here because +// it is expected to trigger the race detector and would make CI non-deterministic. +// That pattern is covered by the code-review gate enforced by ReadOnlyCP (Part B). 
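+//
+// For reference, the mutating anti-pattern that would trip the detector looks
+// like this (illustrative only; never write through the shared pointer):
+//
+//	cp := cache.GetContainerProfile(id)
+//	cp.Spec.Capabilities = append(cp.Spec.Capabilities, "CAP_FOO") // WRONG: shared storage object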
+ +import ( + "context" + "runtime" + "sync" + "testing" + "time" + + "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/objectcache" + cpc "github.com/kubescape/node-agent/pkg/objectcache/containerprofilecache" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestSharedPointerReadersDoNotCorruptCache(t *testing.T) { + const ( + id = "race-container" + numReaders = 50 + testDuration = 500 * time.Millisecond + rpcBudgetMs = 100 * time.Millisecond + ) + + // cpV1 — what is seeded initially (RV="1") + cpV1 := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-race", + Namespace: "default", + ResourceVersion: "1", + }, + Spec: v1beta1.ContainerProfileSpec{ + Execs: []v1beta1.ExecCalls{{Path: "/bin/sh", Args: []string{"a", "b", "c"}}}, + Opens: []v1beta1.OpenCalls{{Path: "/etc/passwd", Flags: []string{"O_RDONLY"}}}, + Capabilities: []string{"CAP_NET_ADMIN", "CAP_SYS_PTRACE"}, + }, + } + + // cpV2 — what storage returns after a refresh (RV="2"); the reconciler will + // create a brand-new entry pointing to cpV2 (never mutating cpV1). + cpV2 := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp-race", + Namespace: "default", + ResourceVersion: "2", + }, + Spec: v1beta1.ContainerProfileSpec{ + Execs: []v1beta1.ExecCalls{{Path: "/bin/bash", Args: []string{"x", "y"}}}, + Opens: []v1beta1.OpenCalls{{Path: "/etc/shadow", Flags: []string{"O_WRONLY"}}}, + Capabilities: []string{"CAP_CHOWN"}, + }, + } + + store := newFakeStorage(cpV2) // storage always returns cpV2 + k8s := newFakeK8sCache() + + cfg := config.Config{ + ProfilesCacheRefreshRate: 30 * time.Second, + StorageRPCBudget: rpcBudgetMs, + } + cache := cpc.NewContainerProfileCache(cfg, store, k8s, nil) + + seedV1 := func() { + cache.SeedEntryForTest(id, &cpc.CachedContainerProfile{ + Profile: cpV1, + State: &objectcache.ProfileState{Name: "cp-race"}, + ContainerName: "container", + PodName: "pod-race", + Namespace: "default", + PodUID: "uid-race", + CPName: "cp-race", + RV: "1", // stale — guarantees refresh rebuilds on each tick + Shared: true, + }) + } + + // Pre-warm SafeMap so concurrent Load never hits the nil-check-before-lock + // initialization race present in goradd/maps v1.3.0 (pre-existing upstream bug). + seedV1() + + require.NotNil(t, cache.GetContainerProfile(id), "pre-condition: entry present before test") + + ctx, cancel := context.WithTimeout(context.Background(), testDuration) + defer cancel() + + var wg sync.WaitGroup + + // 50 reader goroutines — read-only traversal of the returned profile. + wg.Add(numReaders) + for i := 0; i < numReaders; i++ { + go func() { + defer wg.Done() + for ctx.Err() == nil { + cp := cache.GetContainerProfile(id) + if cp == nil { + runtime.Gosched() + continue + } + // Read-only: iterate slices without writing. + for _, e := range cp.Spec.Execs { + _ = e.Path + _ = len(e.Args) + } + for _, o := range cp.Spec.Opens { + _ = o.Path + _ = len(o.Flags) + } + _ = len(cp.Spec.Capabilities) + _ = cp.ResourceVersion + runtime.Gosched() + } + }() + } + + // 1 writer goroutine: alternate refresh (rebuilds entry → cpV2) and reset + // (reseeds entry → cpV1) to keep the refresh loop active across the window. 
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for ctx.Err() == nil {
+			cache.RefreshAllEntriesForTest(ctx)
+			// Reset to cpV1 so the next refresh sees a stale RV and rebuilds again.
+			seedV1()
+		}
+	}()
+
+	wg.Wait()
+
+	// If the race detector fired, the test is already marked as failed. We add
+	// an explicit liveness assertion to guard against a scenario where the entry
+	// gets permanently nil-ed out by a refresh bug.
+	finalCP := cache.GetContainerProfile(id)
+	// The final profile may be backed by cpV1 or cpV2 depending on whether the
+	// last winner was a seed or a refresh, and a nil return is tolerated if the
+	// two raced at shutdown. What we must NOT see is a panic above or a non-nil
+	// profile with an empty ResourceVersion.
+	if finalCP != nil {
+		assert.NotEmpty(t, finalCP.ResourceVersion, "final cached entry must have a non-empty RV")
+	}
+}
+
+// TestSharedPointerFastPathPreservesPointerIdentity verifies that when the
+// reconciler rebuilds an entry from a storage pointer with no overlay, the
+// new entry's Profile points directly to the storage object (Shared=true,
+// no DeepCopy). This is the memory property that Part A is guarding — if it
+// regresses to DeepCopy-on-every-refresh the T3 memory budget is blown.
+func TestSharedPointerFastPathPreservesPointerIdentity(t *testing.T) {
+	cpInStorage := &v1beta1.ContainerProfile{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            "cp-identity",
+			Namespace:       "default",
+			ResourceVersion: "99",
+		},
+		Spec: v1beta1.ContainerProfileSpec{
+			Capabilities: []string{"CAP_NET_RAW"},
+		},
+	}
+
+	store := newFakeStorage(cpInStorage)
+	k8s := newFakeK8sCache()
+	cfg := config.Config{
+		ProfilesCacheRefreshRate: 30 * time.Second,
+		StorageRPCBudget:         100 * time.Millisecond,
+	}
+	cache := cpc.NewContainerProfileCache(cfg, store, k8s, nil)
+
+	// Seed with a stale RV so the refresh rebuilds.
+	cache.SeedEntryForTest("id-identity", &cpc.CachedContainerProfile{
+		Profile:       cpInStorage,
+		State:         &objectcache.ProfileState{Name: "cp-identity"},
+		ContainerName: "container",
+		PodName:       "pod-identity",
+		Namespace:     "default",
+		PodUID:        "uid-identity",
+		CPName:        "cp-identity",
+		RV:            "old",
+		Shared:        true,
+	})
+
+	cache.RefreshAllEntriesForTest(context.Background())
+
+	got := cache.GetContainerProfile("id-identity")
+	require.NotNil(t, got, "entry must be present after refresh")
+	assert.Same(t, cpInStorage, got,
+		"shared fast-path: refresh must store the storage pointer directly (no DeepCopy)")
+	assert.Equal(t, "99", got.ResourceVersion, "RV must match the storage object")
+}
diff --git a/pkg/objectcache/containerprofilecache/t8_overlay_refresh_test.go b/pkg/objectcache/containerprofilecache/t8_overlay_refresh_test.go
new file mode 100644
index 0000000000..ea67a5d172
--- /dev/null
+++ b/pkg/objectcache/containerprofilecache/t8_overlay_refresh_test.go
@@ -0,0 +1,110 @@
+package containerprofilecache_test
+
+// TestT8_EndToEndRefreshUpdatesProjection mirrors the same-named unit test from
+// reconciler_test.go using only the public / test-helper API so it can live at
+// the integration test level (tests/containerprofilecache/).
+//
+// Scenario: an entry backed by CP (RV=100) + user-AP overlay (RV=50) is seeded
+// via SeedEntryWithOverlayForTest. Storage is mutated to serve a new AP
+// (RV=51, different execs). A single RefreshAllEntriesForTest call must rebuild
+// the projection so the cached execs reflect the new AP, not the stale one.
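+//
+// The whole exchange uses only the exported test hooks named above, roughly:
+//
+//	cache := cpc.NewContainerProfileCache(cfg, store, k8s, nil)
+//	cache.SeedEntryWithOverlayForTest(id, entry, "default", "override", "", "")
+//	store.ap = apV2                     // advance storage (guarded by store.mu)
+//	cache.RefreshAllEntriesForTest(ctx) // one reconciler tick
+//	cp := cache.GetContainerProfile(id) // rebuilt projection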
+ +import ( + "context" + "testing" + "time" + + "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/objectcache" + cpc "github.com/kubescape/node-agent/pkg/objectcache/containerprofilecache" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestT8_EndToEndRefreshUpdatesProjection(t *testing.T) { + cp := &v1beta1.ContainerProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cp", + Namespace: "default", + ResourceVersion: "100", + }, + Spec: v1beta1.ContainerProfileSpec{ + Execs: []v1beta1.ExecCalls{{Path: "/bin/base", Args: []string{"a"}}}, + }, + } + apV1 := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "override", + Namespace: "default", + ResourceVersion: "50", + }, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Execs: []v1beta1.ExecCalls{{Path: "/bin/old", Args: []string{"x"}}}, + }}, + }, + } + apV2 := &v1beta1.ApplicationProfile{ + ObjectMeta: metav1.ObjectMeta{ + Name: "override", + Namespace: "default", + ResourceVersion: "51", + }, + Spec: v1beta1.ApplicationProfileSpec{ + Containers: []v1beta1.ApplicationProfileContainer{{ + Name: "nginx", + Execs: []v1beta1.ExecCalls{{Path: "/bin/new", Args: []string{"y"}}}, + }}, + }, + } + + store := newFakeStorage(cp) + store.mu.Lock() + store.ap = apV1 + store.mu.Unlock() + + k8s := newFakeK8sCache() + cfg := config.Config{ + ProfilesCacheRefreshRate: 30 * time.Second, + StorageRPCBudget: 500 * time.Millisecond, + } + cache := cpc.NewContainerProfileCache(cfg, store, k8s, nil) + + const id = "c1" + // Seed a projected entry with a stale UserAPRV so refresh sees the RV change. + // The Profile here is just the base CP; the reconciler will re-project on refresh. + cache.SeedEntryWithOverlayForTest(id, &cpc.CachedContainerProfile{ + Profile: cp, + State: &objectcache.ProfileState{Name: cp.Name}, + ContainerName: "nginx", + PodName: "nginx-abc", + Namespace: "default", + PodUID: "uid-1", + CPName: "cp", + RV: "100", + UserAPRV: "50", // stale — triggers rebuild when storage returns RV=51 + Shared: false, + }, "default", "override", "", "") + + // Advance storage to apV2 (RV=51). The reconciler will see the RV mismatch + // and rebuild the projection from cp + apV2. + store.mu.Lock() + store.ap = apV2 + store.mu.Unlock() + + cache.RefreshAllEntriesForTest(context.Background()) + + stored := cache.GetContainerProfile(id) + require.NotNil(t, stored, "entry must remain after refresh") + + var paths []string + for _, e := range stored.Spec.Execs { + paths = append(paths, e.Path) + } + assert.Contains(t, paths, "/bin/base", "base CP exec must be preserved after overlay refresh") + assert.Contains(t, paths, "/bin/new", "new user-AP exec must appear in the rebuilt projection") + assert.NotContains(t, paths, "/bin/old", "stale user-AP exec must NOT survive the rebuild") +} diff --git a/pkg/objectcache/containerprofilecache_interface.go b/pkg/objectcache/containerprofilecache_interface.go new file mode 100644 index 0000000000..fcf73ab9e9 --- /dev/null +++ b/pkg/objectcache/containerprofilecache_interface.go @@ -0,0 +1,41 @@ +// Package objectcache defines interfaces for the node-agent object cache layer. 
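+//
+// A minimal consumer sketch (the mock below stands in for the real cache; a
+// nil profile means "no profile yet" to callers):
+//
+//	var cache ContainerProfileCache = &ContainerProfileCacheMock{}
+//	if cp := cache.GetContainerProfile("cid"); cp == nil {
+//		// rule evaluation short-circuits as "no profile"
+//	}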
+package objectcache + +import ( + "context" + "errors" + + containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + "github.com/kubescape/node-agent/pkg/objectcache/callstackcache" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" +) + +type ContainerProfileCache interface { + GetContainerProfile(containerID string) *v1beta1.ContainerProfile + GetContainerProfileState(containerID string) *ProfileState + GetCallStackSearchTree(containerID string) *callstackcache.CallStackSearchTree + ContainerCallback(notif containercollection.PubSubEvent) + Start(ctx context.Context) +} + +var _ ContainerProfileCache = (*ContainerProfileCacheMock)(nil) + +type ContainerProfileCacheMock struct{} + +func (cp *ContainerProfileCacheMock) GetContainerProfile(_ string) *v1beta1.ContainerProfile { + return nil +} + +func (cp *ContainerProfileCacheMock) GetContainerProfileState(_ string) *ProfileState { + return &ProfileState{Error: errors.New("mock: profile not found")} +} + +func (cp *ContainerProfileCacheMock) GetCallStackSearchTree(_ string) *callstackcache.CallStackSearchTree { + return nil +} + +func (cp *ContainerProfileCacheMock) ContainerCallback(_ containercollection.PubSubEvent) { +} + +func (cp *ContainerProfileCacheMock) Start(_ context.Context) { +} diff --git a/pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go b/pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go deleted file mode 100644 index a64c1454b5..0000000000 --- a/pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache.go +++ /dev/null @@ -1,949 +0,0 @@ -package networkneighborhoodcache - -import ( - "context" - "fmt" - "strings" - "sync" - "time" - - "github.com/cenkalti/backoff/v5" - mapset "github.com/deckarep/golang-set/v2" - "github.com/goradd/maps" - containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" - "github.com/kubescape/go-logger" - "github.com/kubescape/go-logger/helpers" - helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" - "github.com/armosec/armoapi-go/armotypes" - "github.com/kubescape/node-agent/pkg/config" - "github.com/kubescape/node-agent/pkg/exporters" - "github.com/kubescape/node-agent/pkg/objectcache" - "github.com/kubescape/node-agent/pkg/resourcelocks" - "github.com/kubescape/node-agent/pkg/rulemanager/types" - "github.com/kubescape/node-agent/pkg/signature" - "github.com/kubescape/node-agent/pkg/signature/profiles" - "github.com/kubescape/node-agent/pkg/storage" - "github.com/kubescape/node-agent/pkg/utils" - "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// ContainerInfo holds container metadata we need for network neighborhood mapping -type ContainerInfo struct { - ContainerID string - WorkloadID string - InstanceTemplateHash string - Namespace string - SeenContainerFromTheStart bool // True if container was seen from the start - UserDefinedNetwork string // Non-empty when pod has a user-defined NN label -} - -// NetworkNeighborhoodCacheImpl implements the NetworkNeighborhoodCache interface -type NetworkNeighborhoodCacheImpl struct { - cfg config.Config - workloadIDToNetworkNeighborhood maps.SafeMap[string, *v1beta1.NetworkNeighborhood] - workloadIDToProfileState maps.SafeMap[string, *objectcache.ProfileState] // Tracks profile state even if not in cache - containerIDToInfo maps.SafeMap[string, *ContainerInfo] - networkNeighborhoodToUserManagedIdentifier 
maps.SafeMap[string, string] // networkNeighborhoodName -> user-managed profile unique identifier - storageClient storage.ProfileClient - k8sObjectCache objectcache.K8sObjectCache - exporter exporters.Exporter // Exporter for sending tamper detection alerts - updateInterval time.Duration - updateInProgress bool // Flag to track if update is in progress - updateMutex sync.Mutex // Mutex to protect the flag - containerLocks *resourcelocks.ResourceLocks // Locks for each container to prevent concurrent modifications -} - -// NewNetworkNeighborhoodCache creates a new network neighborhood cache with periodic updates -func NewNetworkNeighborhoodCache(cfg config.Config, storageClient storage.ProfileClient, k8sObjectCache objectcache.K8sObjectCache, exporter exporters.Exporter) *NetworkNeighborhoodCacheImpl { - updateInterval := utils.AddJitter(cfg.ProfilesCacheRefreshRate, 10) // Add 10% jitter to avoid high load on the storage - - nnc := &NetworkNeighborhoodCacheImpl{ - cfg: cfg, - workloadIDToNetworkNeighborhood: maps.SafeMap[string, *v1beta1.NetworkNeighborhood]{}, - workloadIDToProfileState: maps.SafeMap[string, *objectcache.ProfileState]{}, - containerIDToInfo: maps.SafeMap[string, *ContainerInfo]{}, - networkNeighborhoodToUserManagedIdentifier: maps.SafeMap[string, string]{}, - storageClient: storageClient, - k8sObjectCache: k8sObjectCache, - exporter: exporter, - updateInterval: updateInterval, - containerLocks: resourcelocks.New(), - } - - return nnc -} - -// Start begins the periodic update process -func (nnc *NetworkNeighborhoodCacheImpl) Start(ctx context.Context) { - go nnc.periodicUpdate(ctx) -} - -// periodicUpdate periodically fetches and updates network neighborhoods from storage -func (nnc *NetworkNeighborhoodCacheImpl) periodicUpdate(ctx context.Context) { - ticker := time.NewTicker(nnc.updateInterval) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - // Check if an update is already in progress - nnc.updateMutex.Lock() - if nnc.updateInProgress { - // Skip this update cycle - logger.L().Debug("skipping profile update: previous update still in progress") - nnc.updateMutex.Unlock() - continue - } - - // Set the flag and release the lock before the potentially long-running call - nnc.updateInProgress = true - nnc.updateMutex.Unlock() - - // Run the update directly - nnc.updateAllNetworkNeighborhoods(ctx) - - // Mark the update as complete - nnc.updateMutex.Lock() - nnc.updateInProgress = false - nnc.updateMutex.Unlock() - - case <-ctx.Done(): - logger.L().Info("NetworkNeighborhoodsCache periodic update stopped") - return - } - } -} - -// updateAllNetworkNeighborhoods fetches all network neighborhoods from storage and updates the cache -func (nnc *NetworkNeighborhoodCacheImpl) updateAllNetworkNeighborhoods(ctx context.Context) { - // Get unique namespaces from container info - namespaces := nnc.getNamespaces() - if len(namespaces) == 0 { - logger.L().Debug("no namespaces found in cache, skipping network neighborhood update") - return - } - - // Iterate over each namespace - for _, namespace := range namespaces { - // Get container IDs for this namespace - containerIDs := nnc.getContainerIDsForNamespace(namespace) - if len(containerIDs) == 0 { - logger.L().Debug("no containers found for namespace, skipping", - helpers.String("namespace", namespace)) - continue - } - - // Get network neighborhoods list for this namespace - var nnList *v1beta1.NetworkNeighborhoodList - continueToken := "" - for { - list, err := nnc.storageClient.ListNetworkNeighborhoods(namespace, 
int64(50), continueToken) - if err != nil { - logger.L().Error("failed to list network neighborhoods", - helpers.String("namespace", namespace), - helpers.Error(err)) - break - } - - if nnList == nil { - nnList = list - } else { - nnList.Items = append(nnList.Items, list.Items...) - } - - continueToken = list.Continue - if continueToken == "" { - break - } - } - - if nnList == nil { - continue - } - - // Process each network neighborhood - for _, nn := range nnList.Items { - // Handle user-managed network neighborhoods - if isUserManagedNN(&nn) { - nnc.handleUserManagedNetworkNeighborhood(&nn) - continue - } - - // Get the workload ID from network neighborhood - workloadID := nnc.wlidKey( - nn.Annotations[helpersv1.WlidMetadataKey], - nn.Labels[helpersv1.TemplateHashKey], - ) - if workloadID == "" { - continue - } - - // Update profile state regardless of whether we'll update the full profile - profileState := &objectcache.ProfileState{ - Completion: nn.Annotations[helpersv1.CompletionMetadataKey], - Status: nn.Annotations[helpersv1.StatusMetadataKey], - Name: nn.Name, - Error: nil, - } - nnc.workloadIDToProfileState.Set(workloadID, profileState) - - // Only consider completed network neighborhoods - if nn.Annotations[helpersv1.StatusMetadataKey] != helpersv1.Completed { - continue - } - - // Check if this workload ID is used by any container in this namespace - workloadIDInUse := false - hasNewContainer := false // Track if any container using this workload was seen from start - for _, containerID := range containerIDs { - if containerInfo, exists := nnc.containerIDToInfo.Load(containerID); exists && - containerInfo.WorkloadID == workloadID && - containerInfo.InstanceTemplateHash == nn.Labels[helpersv1.TemplateHashKey] { - workloadIDInUse = true - // If any container was seen from start, mark it - if containerInfo.SeenContainerFromTheStart { - hasNewContainer = true - } - } - } - - if !workloadIDInUse { - continue - } - - // Never overwrite a user-defined network neighborhood with an - // auto-learned one. Check if any container for this workload - // has a user-defined-network label. 
- if nnc.workloadHasUserDefinedNetwork(workloadID) { - continue - } - - // If we have a "new" container (seen from start) and the network neighborhood is partial, - // skip it - we don't want to use partial profiles for containers we're tracking from the start - if hasNewContainer && nn.Annotations[helpersv1.CompletionMetadataKey] == helpersv1.Partial { - logger.L().Debug("skipping partial network neighborhood for container seen from start", - helpers.String("workloadID", workloadID), - helpers.String("namespace", namespace)) - continue - } - - // Update the network neighborhood in the cache - if existingNN, exists := nnc.workloadIDToNetworkNeighborhood.Load(workloadID); exists { - // If the network neighborhood already exists and it's complete/completed, continue to the next one - if existingNN.Annotations[helpersv1.CompletionMetadataKey] == helpersv1.Full { - continue - } - - // If the new network neighborhood is not complete and we already have a completed/partial one, skip it - if nn.Annotations[helpersv1.CompletionMetadataKey] != helpersv1.Full { - continue - } - } - - // Fetch the network neighborhood from storage - fullNN, err := nnc.storageClient.GetNetworkNeighborhood(namespace, nn.Name) - if err != nil { - logger.L().Error("failed to get network neighborhood", - helpers.String("workloadID", workloadID), - helpers.String("namespace", namespace), - helpers.Error(err)) - profileState.Error = err - nnc.workloadIDToProfileState.Set(workloadID, profileState) - continue - } - - // Verify signature — always check signed NNs for tamper (R1016), - // enforcement mode only controls whether tampered NNs are loaded. - if err := nnc.verifyNetworkNeighborhood(fullNN, workloadID); err != nil { - profileState.Error = fmt.Errorf("signature verification failed: %w", err) - nnc.workloadIDToProfileState.Set(workloadID, profileState) - continue - } - - nnc.workloadIDToNetworkNeighborhood.Set(workloadID, fullNN) - logger.L().Debug("updated network neighborhood in cache", - helpers.String("workloadID", workloadID), - helpers.String("namespace", namespace), - helpers.String("status", nn.Annotations[helpersv1.StatusMetadataKey]), - helpers.String("completion", nn.Annotations[helpersv1.CompletionMetadataKey])) - } - } -} - -// handleUserManagedNetworkNeighborhood handles user-managed network neighborhoods -func (nnc *NetworkNeighborhoodCacheImpl) handleUserManagedNetworkNeighborhood(nn *v1beta1.NetworkNeighborhood) { - normalizedNNName := strings.TrimPrefix(nn.Name, helpersv1.UserNetworkNeighborhoodPrefix) - userManagedNNUniqueIdentifier := nn.ResourceVersion + string(nn.UID) - - // Create a unique tracking key for this user network neighborhood - nnKey := nnc.networkNeighborhoodKey(nn.Namespace, normalizedNNName) - - // Check if we've already processed this exact version of the user-managed network neighborhood - if storedIdentifier, exists := nnc.networkNeighborhoodToUserManagedIdentifier.Load(nnKey); exists && - storedIdentifier == userManagedNNUniqueIdentifier { - return - } - - // Find and collect the network neighborhood to merge - var toMerge struct { - wlid string - nn *v1beta1.NetworkNeighborhood - } - - nnc.workloadIDToNetworkNeighborhood.Range(func(wlid string, originalNN *v1beta1.NetworkNeighborhood) bool { - if originalNN.Name == normalizedNNName && originalNN.Namespace == nn.Namespace { - toMerge.wlid = wlid - toMerge.nn = originalNN - logger.L().Debug("found matching network neighborhood for user-managed network neighborhood", - helpers.String("workloadID", wlid), - 
helpers.String("namespace", originalNN.Namespace), - helpers.String("nnName", originalNN.Name)) - // Stop iteration - return false - } - return true - }) - - // If we didn't find a matching network neighborhood, skip merging - if toMerge.nn == nil { - return - } - - // Fetch the full user network neighborhood - fullUserNN, err := nnc.storageClient.GetNetworkNeighborhood(nn.Namespace, nn.Name) - if err != nil { - logger.L().Error("failed to get user-managed network neighborhood", - helpers.String("namespace", nn.Namespace), - helpers.String("nnName", nn.Name), - helpers.Error(err)) - return - } - - // Merge the user-managed network neighborhood with the normal network neighborhood - - // First, pull the original network neighborhood from the storage - originalNN, err := nnc.storageClient.GetNetworkNeighborhood(toMerge.nn.Namespace, toMerge.nn.Name) - if err != nil { - logger.L().Error("failed to get original network neighborhood", - helpers.String("namespace", toMerge.nn.Namespace), - helpers.String("nnName", toMerge.nn.Name), - helpers.Error(err)) - return - } - - // Verify signature on the original network neighborhood before merging - if err := nnc.verifyNetworkNeighborhood(originalNN, toMerge.wlid); err != nil { - profileState := &objectcache.ProfileState{ - Completion: originalNN.Annotations[helpersv1.CompletionMetadataKey], - Status: originalNN.Annotations[helpersv1.StatusMetadataKey], - Name: originalNN.Name, - Error: fmt.Errorf("signature verification failed: %w", err), - } - nnc.workloadIDToProfileState.Set(toMerge.wlid, profileState) - // Evict stale merged profile from cache on verification failure - nnc.workloadIDToNetworkNeighborhood.Delete(toMerge.wlid) - return - } - - // Verify signature on the user-managed network neighborhood before merging - if err := nnc.verifyNetworkNeighborhood(fullUserNN, toMerge.wlid); err != nil { - profileState := &objectcache.ProfileState{ - Completion: fullUserNN.Annotations[helpersv1.CompletionMetadataKey], - Status: fullUserNN.Annotations[helpersv1.StatusMetadataKey], - Name: fullUserNN.Name, - Error: fmt.Errorf("signature verification failed: %w", err), - } - nnc.workloadIDToProfileState.Set(toMerge.wlid, profileState) - // Restore cache to originalNN on user-managed verification failure - nnc.workloadIDToNetworkNeighborhood.Set(toMerge.wlid, originalNN) - return - } - - // Merge the network neighborhoods - mergedNN := nnc.performMerge(originalNN, fullUserNN) - - // Clear stale signature annotations after merge - delete(mergedNN.Annotations, signature.AnnotationSignature) - delete(mergedNN.Annotations, signature.AnnotationCertificate) - delete(mergedNN.Annotations, signature.AnnotationRekorBundle) - delete(mergedNN.Annotations, signature.AnnotationIssuer) - delete(mergedNN.Annotations, signature.AnnotationIdentity) - delete(mergedNN.Annotations, signature.AnnotationTimestamp) - - // Update the cache with the merged network neighborhood - nnc.workloadIDToNetworkNeighborhood.Set(toMerge.wlid, mergedNN) - // Update profile state for the merged profile - profileState := &objectcache.ProfileState{ - Completion: mergedNN.Annotations[helpersv1.CompletionMetadataKey], - Status: mergedNN.Annotations[helpersv1.StatusMetadataKey], - Name: mergedNN.Name, - Error: nil, - } - nnc.workloadIDToProfileState.Set(toMerge.wlid, profileState) - logger.L().Debug("merged user-managed network neighborhood with normal network neighborhood", - helpers.String("workloadID", toMerge.wlid), - helpers.String("namespace", nn.Namespace), - helpers.String("nnName", nn.Name)) - 
- // Record that we've processed this version of the network neighborhood - nnc.networkNeighborhoodToUserManagedIdentifier.Set(nnKey, userManagedNNUniqueIdentifier) -} - -// ContainerCallback handles container lifecycle events -func (nnc *NetworkNeighborhoodCacheImpl) ContainerCallback(notif containercollection.PubSubEvent) { - isHost := utils.IsHostContainer(notif.Container) - namespace := notif.Container.K8s.Namespace - if isHost { - namespace = "host" - } - switch notif.Type { - case containercollection.EventTypeAddContainer: - if !isHost && nnc.cfg.IgnoreContainer(namespace, notif.Container.K8s.PodName, notif.Container.K8s.PodLabels) { - return - } - container := notif.Container - if isHost { - containerCopy := *notif.Container - containerCopy.K8s.Namespace = namespace - container = &containerCopy - } - go nnc.addContainerWithTimeout(container) - case containercollection.EventTypeRemoveContainer: - if !isHost && nnc.cfg.IgnoreContainer(namespace, notif.Container.K8s.PodName, notif.Container.K8s.PodLabels) { - return - } - go nnc.deleteContainer(notif.Container.Runtime.ContainerID) - } -} - -// addContainerWithTimeout handles adding a container with a timeout to prevent hanging -func (nnc *NetworkNeighborhoodCacheImpl) addContainerWithTimeout(container *containercollection.Container) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) - defer cancel() - - done := make(chan error, 1) - go func() { - done <- nnc.addContainer(container, ctx) - }() - - select { - case err := <-done: - if err != nil { - logger.L().Error("failed to add container to the cache", helpers.Error(err)) - } - case <-ctx.Done(): - logger.L().Error("timeout while adding container to the cache", - helpers.String("containerID", container.Runtime.ContainerID), - helpers.String("containerName", container.Runtime.ContainerName), - helpers.String("podName", container.K8s.PodName), - helpers.String("namespace", container.K8s.Namespace)) - } -} - -// addContainer adds a container to the cache -func (nnc *NetworkNeighborhoodCacheImpl) addContainer(container *containercollection.Container, ctx context.Context) error { - containerID := container.Runtime.ContainerID - - return nnc.containerLocks.WithLockAndError(containerID, func() error { - // Get workload ID from shared data - sharedData, err := nnc.waitForSharedContainerData(containerID, ctx) - if err != nil { - logger.L().Error("failed to get shared data for container", - helpers.String("containerID", containerID), - helpers.Error(err)) - return err - } - - workloadID := nnc.wlidKey(sharedData.Wlid, sharedData.InstanceID.GetTemplateHash()) - if workloadID == "" { - logger.L().Debug("empty workloadID for container", helpers.String("containerID", containerID)) - return nil - } - - // If container restarts and profile is partial, delete it from cache - // This ensures we don't alert on activity we didn't see after restart - if existingNN, exists := nnc.workloadIDToNetworkNeighborhood.Load(workloadID); exists && !sharedData.PreRunningContainer { - if existingNN != nil && existingNN.Annotations != nil { - completion := existingNN.Annotations[helpersv1.CompletionMetadataKey] - if completion == helpersv1.Partial { - logger.L().Debug("deleting partial network neighborhood on container restart", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace)) - - // Delete the network neighborhood from cache - nnKey := nnc.networkNeighborhoodKey(existingNN.Namespace, 
existingNN.Name) - nnc.networkNeighborhoodToUserManagedIdentifier.Delete(nnKey) - nnc.workloadIDToNetworkNeighborhood.Delete(workloadID) - } - } - } - - // Create container info - // Mark container as "seen from start" if it is not pre-running - containerInfo := &ContainerInfo{ - ContainerID: containerID, - WorkloadID: workloadID, - InstanceTemplateHash: sharedData.InstanceID.GetTemplateHash(), - Namespace: container.K8s.Namespace, - SeenContainerFromTheStart: !sharedData.PreRunningContainer, - UserDefinedNetwork: sharedData.UserDefinedNetwork, - } - - // Add to container info map - nnc.containerIDToInfo.Set(containerID, containerInfo) - - // If the container has a user-defined network neighborhood, load it - // directly into the cache — skip learning entirely for this workload. - if sharedData.UserDefinedNetwork != "" { - fullNN, err := nnc.storageClient.GetNetworkNeighborhood( - container.K8s.Namespace, sharedData.UserDefinedNetwork) - if err != nil { - logger.L().Error("failed to get user-defined network neighborhood", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace), - helpers.String("nnName", sharedData.UserDefinedNetwork), - helpers.Error(err)) - profileState := &objectcache.ProfileState{ - Error: err, - } - nnc.workloadIDToProfileState.Set(workloadID, profileState) - return nil - } - - nnc.workloadIDToNetworkNeighborhood.Set(workloadID, fullNN) - profileState := &objectcache.ProfileState{ - Completion: helpersv1.Full, - Status: helpersv1.Completed, - Name: fullNN.Name, - Error: nil, - } - nnc.workloadIDToProfileState.Set(workloadID, profileState) - - logger.L().Debug("added user-defined network neighborhood to cache", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace), - helpers.String("nnName", sharedData.UserDefinedNetwork)) - return nil - } - - // Create workload ID to state mapping - if _, exists := nnc.workloadIDToProfileState.Load(workloadID); !exists { - nnc.workloadIDToProfileState.Set(workloadID, &objectcache.ProfileState{ - Error: fmt.Errorf("waiting for profile update"), - }) - } - - logger.L().Debug("container added to cache", - helpers.String("containerID", containerID), - helpers.String("workloadID", workloadID), - helpers.String("namespace", container.K8s.Namespace)) - - return nil - }) -} - -// deleteContainer deletes a container from the cache -func (nnc *NetworkNeighborhoodCacheImpl) deleteContainer(containerID string) { - nnc.containerLocks.WithLock(containerID, func() { - // Get container info - containerInfo, exists := nnc.containerIDToInfo.Load(containerID) - if !exists { - logger.L().Debug("containerID not found in cache", helpers.String("containerID", containerID)) - return - } - - // Clean up container info - nnc.containerIDToInfo.Delete(containerID) - - // Check if any other container is using the same workload ID - workloadStillInUse := false - nnc.containerIDToInfo.Range(func(_ string, info *ContainerInfo) bool { - if info.WorkloadID == containerInfo.WorkloadID { - workloadStillInUse = true - return false // Stop iteration - } - return true // Continue iteration - }) - - // If no other container is using the same workload ID, delete it from the cache - if !workloadStillInUse { - if nn, exists := nnc.workloadIDToNetworkNeighborhood.Load(containerInfo.WorkloadID); exists { - // Remove any user managed identifiers related to this network neighborhood - nnKey := 
nnc.networkNeighborhoodKey(nn.Namespace, nn.Name) - nnc.networkNeighborhoodToUserManagedIdentifier.Delete(nnKey) - } - nnc.workloadIDToNetworkNeighborhood.Delete(containerInfo.WorkloadID) - nnc.workloadIDToProfileState.Delete(containerInfo.WorkloadID) - logger.L().Debug("deleted workloadID from cache", helpers.String("workloadID", containerInfo.WorkloadID)) - } - }) - - // Clean up the lock when done - call this outside the WithLock closure - nnc.containerLocks.ReleaseLock(containerID) -} - -// waitForSharedContainerData waits for shared container data to be available -func (nnc *NetworkNeighborhoodCacheImpl) waitForSharedContainerData(containerID string, ctx context.Context) (*objectcache.WatchedContainerData, error) { - return backoff.Retry(ctx, func() (*objectcache.WatchedContainerData, error) { - if sharedData := nnc.k8sObjectCache.GetSharedContainerData(containerID); sharedData != nil { - return sharedData, nil - } - return nil, fmt.Errorf("container %s not found in shared data", containerID) - }, backoff.WithBackOff(backoff.NewExponentialBackOff())) -} - -func (nnc *NetworkNeighborhoodCacheImpl) networkNeighborhoodKey(namespace, name string) string { - return fmt.Sprintf("%s/%s", namespace, name) -} - -func (nnc *NetworkNeighborhoodCacheImpl) wlidKey(wlid, templateHash string) string { - return fmt.Sprintf("%s/%s", wlid, templateHash) -} - -// GetNetworkNeighborhood gets the network neighborhood for a container -func (nnc *NetworkNeighborhoodCacheImpl) GetNetworkNeighborhood(containerID string) *v1beta1.NetworkNeighborhood { - // Get container info - if containerInfo, exists := nnc.containerIDToInfo.Load(containerID); exists { - workloadID := containerInfo.WorkloadID - if workloadID == "" { - return nil - } - - // Try to get network neighborhood from cache - if nn, exists := nnc.workloadIDToNetworkNeighborhood.Load(workloadID); exists { - if nn != nil { - return nn - } - } - } - - return nil -} - -// GetNetworkNeighborhoodState gets the profile state for a container -func (nnc *NetworkNeighborhoodCacheImpl) GetNetworkNeighborhoodState(containerID string) *objectcache.ProfileState { - // Get container info - containerInfo, exists := nnc.containerIDToInfo.Load(containerID) - if !exists { - return &objectcache.ProfileState{ - Error: fmt.Errorf("container %s not found in cache", containerID), - } - } - - workloadID := containerInfo.WorkloadID - if workloadID == "" { - return &objectcache.ProfileState{ - Error: fmt.Errorf("no workload ID for container %s", containerID), - } - } - - // Try to get profile state from cache - if profileState, exists := nnc.workloadIDToProfileState.Load(workloadID); exists { - if profileState != nil { - return profileState - } - return &objectcache.ProfileState{ - Error: fmt.Errorf("network neighborhood state is nil for workload %s", workloadID), - } - } - - return &objectcache.ProfileState{ - Error: fmt.Errorf("profile state not found for workload ID %s", workloadID), - } -} - -// performMerge merges a user-managed network neighborhood with a normal network neighborhood -func (nnc *NetworkNeighborhoodCacheImpl) performMerge(normalNN, userManagedNN *v1beta1.NetworkNeighborhood) *v1beta1.NetworkNeighborhood { - mergedNN := normalNN.DeepCopy() - - // Merge spec - mergedNN.Spec.Containers = nnc.mergeContainers(mergedNN.Spec.Containers, userManagedNN.Spec.Containers) - mergedNN.Spec.InitContainers = nnc.mergeContainers(mergedNN.Spec.InitContainers, userManagedNN.Spec.InitContainers) - mergedNN.Spec.EphemeralContainers = 
nnc.mergeContainers(mergedNN.Spec.EphemeralContainers, userManagedNN.Spec.EphemeralContainers) - - // Merge LabelSelector - if userManagedNN.Spec.LabelSelector.MatchLabels != nil { - if mergedNN.Spec.LabelSelector.MatchLabels == nil { - mergedNN.Spec.LabelSelector.MatchLabels = make(map[string]string) - } - for k, v := range userManagedNN.Spec.LabelSelector.MatchLabels { - mergedNN.Spec.LabelSelector.MatchLabels[k] = v - } - } - mergedNN.Spec.LabelSelector.MatchExpressions = append( - mergedNN.Spec.LabelSelector.MatchExpressions, - userManagedNN.Spec.LabelSelector.MatchExpressions..., - ) - - return mergedNN -} - -func (nnc *NetworkNeighborhoodCacheImpl) mergeContainers(normalContainers, userManagedContainers []v1beta1.NetworkNeighborhoodContainer) []v1beta1.NetworkNeighborhoodContainer { - if len(userManagedContainers) != len(normalContainers) { - // If the number of containers don't match, we can't merge - logger.L().Warning("NetworkNeighborhoodCacheImpl - failed to merge user-managed profile with base profile", - helpers.Int("normalContainers len", len(normalContainers)), - helpers.Int("userManagedContainers len", len(userManagedContainers)), - helpers.String("reason", "number of containers don't match")) - return normalContainers - } - - // Assuming the normalContainers are already in the correct Pod order - // We'll merge user containers at their corresponding positions - for i := range normalContainers { - for _, userContainer := range userManagedContainers { - if normalContainers[i].Name == userContainer.Name { - nnc.mergeContainer(&normalContainers[i], &userContainer) - break - } - } - } - return normalContainers -} - -func (nnc *NetworkNeighborhoodCacheImpl) mergeContainer(normalContainer, userContainer *v1beta1.NetworkNeighborhoodContainer) { - // Merge ingress rules - normalContainer.Ingress = nnc.mergeNetworkNeighbors(normalContainer.Ingress, userContainer.Ingress) - - // Merge egress rules - normalContainer.Egress = nnc.mergeNetworkNeighbors(normalContainer.Egress, userContainer.Egress) -} - -func (nnc *NetworkNeighborhoodCacheImpl) mergeNetworkNeighbors(normalNeighbors, userNeighbors []v1beta1.NetworkNeighbor) []v1beta1.NetworkNeighbor { - // Use map to track existing neighbors by identifier - neighborMap := make(map[string]int) - for i, neighbor := range normalNeighbors { - neighborMap[neighbor.Identifier] = i - } - - // Merge or append user neighbors - for _, userNeighbor := range userNeighbors { - if idx, exists := neighborMap[userNeighbor.Identifier]; exists { - // Merge existing neighbor - normalNeighbors[idx] = nnc.mergeNetworkNeighbor(normalNeighbors[idx], userNeighbor) - } else { - // Append new neighbor - normalNeighbors = append(normalNeighbors, userNeighbor) - } - } - - return normalNeighbors -} - -func (nnc *NetworkNeighborhoodCacheImpl) mergeNetworkNeighbor(normal, user v1beta1.NetworkNeighbor) v1beta1.NetworkNeighbor { - merged := normal.DeepCopy() - - // Merge DNS names (removing duplicates) - dnsNamesSet := make(map[string]struct{}) - for _, dns := range normal.DNSNames { - dnsNamesSet[dns] = struct{}{} - } - for _, dns := range user.DNSNames { - dnsNamesSet[dns] = struct{}{} - } - merged.DNSNames = make([]string, 0, len(dnsNamesSet)) - for dns := range dnsNamesSet { - merged.DNSNames = append(merged.DNSNames, dns) - } - - // Merge ports based on patchMergeKey (name) - merged.Ports = nnc.mergeNetworkPorts(merged.Ports, user.Ports) - - // Merge pod selector if provided - if user.PodSelector != nil { - if merged.PodSelector == nil { - merged.PodSelector = 
&metav1.LabelSelector{} - } - if user.PodSelector.MatchLabels != nil { - if merged.PodSelector.MatchLabels == nil { - merged.PodSelector.MatchLabels = make(map[string]string) - } - for k, v := range user.PodSelector.MatchLabels { - merged.PodSelector.MatchLabels[k] = v - } - } - merged.PodSelector.MatchExpressions = append( - merged.PodSelector.MatchExpressions, - user.PodSelector.MatchExpressions..., - ) - } - - // Merge namespace selector if provided - if user.NamespaceSelector != nil { - if merged.NamespaceSelector == nil { - merged.NamespaceSelector = &metav1.LabelSelector{} - } - if user.NamespaceSelector.MatchLabels != nil { - if merged.NamespaceSelector.MatchLabels == nil { - merged.NamespaceSelector.MatchLabels = make(map[string]string) - } - for k, v := range user.NamespaceSelector.MatchLabels { - merged.NamespaceSelector.MatchLabels[k] = v - } - } - merged.NamespaceSelector.MatchExpressions = append( - merged.NamespaceSelector.MatchExpressions, - user.NamespaceSelector.MatchExpressions..., - ) - } - - // Take the user's IP address if provided - if user.IPAddress != "" { - merged.IPAddress = user.IPAddress - } - - // Take the user's type if provided - if user.Type != "" { - merged.Type = user.Type - } - - return *merged -} - -func (nnc *NetworkNeighborhoodCacheImpl) mergeNetworkPorts(normalPorts, userPorts []v1beta1.NetworkPort) []v1beta1.NetworkPort { - // Use map to track existing ports by name (patchMergeKey) - portMap := make(map[string]int) - for i, port := range normalPorts { - portMap[port.Name] = i - } - - // Merge or append user ports - for _, userPort := range userPorts { - if idx, exists := portMap[userPort.Name]; exists { - // Update existing port - normalPorts[idx] = userPort - } else { - // Append new port - normalPorts = append(normalPorts, userPort) - } - } - - return normalPorts -} - -// workloadHasUserDefinedNetwork returns true if any container tracked for -// the given workloadID has a user-defined-network label set. -func (nnc *NetworkNeighborhoodCacheImpl) workloadHasUserDefinedNetwork(workloadID string) bool { - found := false - nnc.containerIDToInfo.Range(func(_ string, info *ContainerInfo) bool { - if info.WorkloadID == workloadID && info.UserDefinedNetwork != "" { - found = true - return false // stop iteration - } - return true - }) - return found -} - -// verifyNetworkNeighborhood verifies the NN signature. -// Always checks signed NNs for tamper (emits R1016 alert on tamper). -// When EnableSignatureVerification is true, also rejects tampered/unsigned NNs. -// Returns error if the NN should not be loaded, nil otherwise. 
-func (nnc *NetworkNeighborhoodCacheImpl) verifyNetworkNeighborhood(nn *v1beta1.NetworkNeighborhood, workloadID string) error { - adapter := profiles.NewNetworkNeighborhoodAdapter(nn) - - // Always check signed NNs for tamper, regardless of enforcement setting - if signature.IsSigned(adapter) { - if err := signature.VerifyObjectStrict(adapter); err != nil { - logger.L().Warning("network neighborhood signature verification failed (tamper detected)", - helpers.String("name", nn.Name), - helpers.String("namespace", nn.Namespace), - helpers.String("workloadID", workloadID), - helpers.Error(err)) - - // Emit R1016 tamper alert - nnc.emitTamperAlert(nn.Name, nn.Namespace, workloadID, "NetworkNeighborhood", err) - - if nnc.cfg.EnableSignatureVerification { - return err - } - // Enforcement off: allow loading despite tamper - return nil - } - return nil - } - - // Not signed - if nnc.cfg.EnableSignatureVerification { - return fmt.Errorf("network neighborhood is not signed") - } - return nil -} - -// emitTamperAlert sends an R1016 "Signed profile tampered" alert via the exporter. -func (nnc *NetworkNeighborhoodCacheImpl) emitTamperAlert(nnName, namespace, workloadID, objectKind string, verifyErr error) { - if nnc.exporter == nil { - return - } - - ruleFailure := &types.GenericRuleFailure{ - BaseRuntimeAlert: armotypes.BaseRuntimeAlert{ - AlertName: "Signed profile tampered", - InfectedPID: 1, - Severity: 10, - FixSuggestions: "Investigate who modified the " + objectKind + " '" + nnName + "' in namespace '" + namespace + "'. Re-sign the profile after verifying its contents.", - }, - AlertType: armotypes.AlertTypeRule, - RuntimeProcessDetails: armotypes.ProcessTree{ - ProcessTree: armotypes.Process{ - PID: 1, - Comm: "node-agent", - }, - }, - RuleAlert: armotypes.RuleAlert{ - RuleDescription: fmt.Sprintf("Signed %s '%s' in namespace '%s' has been tampered with: %v", objectKind, nnName, namespace, verifyErr), - }, - RuntimeAlertK8sDetails: armotypes.RuntimeAlertK8sDetails{ - Namespace: namespace, - }, - RuleID: "R1016", - } - - // Populate workload details from workloadID if available - ruleFailure.SetWorkloadDetails(extractWlidFromWorkloadID(workloadID)) - - nnc.exporter.SendRuleAlert(ruleFailure) -} - -// extractWlidFromWorkloadID extracts the wlid part from a "wlid/templateHash" key. 
-func extractWlidFromWorkloadID(workloadID string) string { - if idx := strings.LastIndex(workloadID, "/"); idx > 0 { - return workloadID[:idx] - } - return workloadID -} - -func isUserManagedNN(nn *v1beta1.NetworkNeighborhood) bool { - return nn.Annotations != nil && - nn.Annotations[helpersv1.ManagedByMetadataKey] == helpersv1.ManagedByUserValue && - strings.HasPrefix(nn.GetName(), helpersv1.UserNetworkNeighborhoodPrefix) -} - -// getNamespaces retrieves all unique namespaces from the container info cache -func (nnc *NetworkNeighborhoodCacheImpl) getNamespaces() []string { - namespaceSet := mapset.NewSet[string]() - nnc.containerIDToInfo.Range(func(_ string, info *ContainerInfo) bool { - namespaceSet.Add(info.Namespace) - return true - }) - return namespaceSet.ToSlice() -} - -// getContainerIDsForNamespace retrieves all container IDs for a given namespace -func (nnc *NetworkNeighborhoodCacheImpl) getContainerIDsForNamespace(namespace string) []string { - containerIDs := []string{} - nnc.containerIDToInfo.Range(func(containerID string, info *ContainerInfo) bool { - if info.Namespace == namespace { - containerIDs = append(containerIDs, containerID) - } - return true - }) - return containerIDs -} - -// Ensure NetworkNeighborhoodCacheImpl implements the NetworkNeighborhoodCache interface -var _ objectcache.NetworkNeighborhoodCache = (*NetworkNeighborhoodCacheImpl)(nil) diff --git a/pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache_test.go b/pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache_test.go deleted file mode 100644 index 47ea2097e5..0000000000 --- a/pkg/objectcache/networkneighborhoodcache/networkneighborhoodcache_test.go +++ /dev/null @@ -1,101 +0,0 @@ -package networkneighborhoodcache - -import ( - "context" - "fmt" - "testing" - - "github.com/kubescape/node-agent/pkg/config" - "github.com/kubescape/node-agent/pkg/storage" - "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// SpyProfileClient for testing pagination -type SpyProfileClient struct { - storage.ProfileClient - NetworkNeighborhoods []v1beta1.NetworkNeighborhood - CallCount int -} - -func (m *SpyProfileClient) ListNetworkNeighborhoods(namespace string, limit int64, cont string) (*v1beta1.NetworkNeighborhoodList, error) { - m.CallCount++ - start := 0 - if cont != "" { - fmt.Sscanf(cont, "%d", &start) - } - - end := start + int(limit) - nextCont := "" - if end < len(m.NetworkNeighborhoods) { - nextCont = fmt.Sprintf("%d", end) - } else { - end = len(m.NetworkNeighborhoods) - } - - return &v1beta1.NetworkNeighborhoodList{ - ListMeta: metav1.ListMeta{ - Continue: nextCont, - }, - Items: m.NetworkNeighborhoods[start:end], - }, nil -} - -func (m *SpyProfileClient) ListApplicationProfiles(namespace string, limit int64, cont string) (*v1beta1.ApplicationProfileList, error) { - return &v1beta1.ApplicationProfileList{}, nil -} - -func (m *SpyProfileClient) GetNetworkNeighborhood(namespace, name string) (*v1beta1.NetworkNeighborhood, error) { - // Return empty object - return &v1beta1.NetworkNeighborhood{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, - Annotations: map[string]string{ - "kubescape.io/completion": "complete", - "kubescape.io/status": "completed", - }, - }, - }, nil -} - -func TestPagination(t *testing.T) { - totalItems := 120 - items := make([]v1beta1.NetworkNeighborhood, totalItems) - for i := 0; i < totalItems; i++ { - items[i] = v1beta1.NetworkNeighborhood{ - ObjectMeta: 
metav1.ObjectMeta{ - Name: fmt.Sprintf("nn-%d", i), - Namespace: "default", - Annotations: map[string]string{ - "kubescape.io/completion": "complete", - "kubescape.io/status": "completed", - }, - Labels: map[string]string{ - "kubescape.io/wlid-template-hash": "hash", - }, - }, - } - } - - spy := &SpyProfileClient{NetworkNeighborhoods: items} - - cache := NewNetworkNeighborhoodCache(config.Config{}, spy, nil, nil) - - // Inject a container so that "default" namespace is processed. - cache.containerIDToInfo.Set("test-container", &ContainerInfo{ - Namespace: "default", - WorkloadID: "wlid", - }) - - // Call the private method - cache.updateAllNetworkNeighborhoods(context.Background()) - - // We expect 3 calls: - // 1. 0-50, returns continue="50" - // 2. 50-100, returns continue="100" - // 3. 100-120, returns continue="" - if spy.CallCount != 3 { - t.Errorf("Expected 3 calls to ListNetworkNeighborhoods, got %d", spy.CallCount) - } -} diff --git a/pkg/objectcache/networkneighborhoodcache_interface.go b/pkg/objectcache/networkneighborhoodcache_interface.go deleted file mode 100644 index fe617ced6d..0000000000 --- a/pkg/objectcache/networkneighborhoodcache_interface.go +++ /dev/null @@ -1,28 +0,0 @@ -package objectcache - -import ( - containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" - "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" -) - -type NetworkNeighborhoodCache interface { - GetNetworkNeighborhood(containerID string) *v1beta1.NetworkNeighborhood - GetNetworkNeighborhoodState(containerID string) *ProfileState - ContainerCallback(notif containercollection.PubSubEvent) -} - -var _ NetworkNeighborhoodCache = (*NetworkNeighborhoodCacheMock)(nil) - -type NetworkNeighborhoodCacheMock struct { -} - -func (nn *NetworkNeighborhoodCacheMock) GetNetworkNeighborhood(_ string) *v1beta1.NetworkNeighborhood { - return nil -} - -func (nn *NetworkNeighborhoodCacheMock) ContainerCallback(_ containercollection.PubSubEvent) { -} - -func (nn *NetworkNeighborhoodCacheMock) GetNetworkNeighborhoodState(_ string) *ProfileState { - return nil -} diff --git a/pkg/objectcache/objectcache_interface.go b/pkg/objectcache/objectcache_interface.go index 8621b0b84e..ce89ff12fe 100644 --- a/pkg/objectcache/objectcache_interface.go +++ b/pkg/objectcache/objectcache_interface.go @@ -2,8 +2,7 @@ package objectcache type ObjectCache interface { K8sObjectCache() K8sObjectCache - ApplicationProfileCache() ApplicationProfileCache - NetworkNeighborhoodCache() NetworkNeighborhoodCache + ContainerProfileCache() ContainerProfileCache DnsCache() DnsCache } @@ -19,11 +18,8 @@ func (om *ObjectCacheMock) K8sObjectCache() K8sObjectCache { return &K8sObjectCacheMock{} } -func (om *ObjectCacheMock) ApplicationProfileCache() ApplicationProfileCache { - return &ApplicationProfileCacheMock{} -} -func (om *ObjectCacheMock) NetworkNeighborhoodCache() NetworkNeighborhoodCache { - return &NetworkNeighborhoodCacheMock{} +func (om *ObjectCacheMock) ContainerProfileCache() ContainerProfileCache { + return &ContainerProfileCacheMock{} } func (om *ObjectCacheMock) DnsCache() DnsCache { diff --git a/pkg/objectcache/shared_container_data.go b/pkg/objectcache/shared_container_data.go index a1e8cd76c4..606ed3bd21 100644 --- a/pkg/objectcache/shared_container_data.go +++ b/pkg/objectcache/shared_container_data.go @@ -88,6 +88,7 @@ type WatchedContainerData struct { UserDefinedProfile string UserDefinedNetwork string LabelOverrides map[string]string // optional label overrides applied after 
GetLabels() + LearningPeriod time.Duration } type ContainerInfo struct { @@ -96,31 +97,19 @@ type ContainerInfo struct { ImageID string } +func formatDuration(d time.Duration) string { + s := d.String() + s = strings.Replace(s, "m0s", "m", 1) + s = strings.Replace(s, "h0m", "h", 1) + return s +} + func GetLabels(cloudMetadata *armotypes.CloudMetadata, watchedContainer *WatchedContainerData, stripContainer bool) map[string]string { labels := watchedContainer.InstanceID.GetLabels() - for i := range labels { - if labels[i] == "" || (stripContainer && i == helpersv1.ContainerNameMetadataKey) { - delete(labels, i) - continue - } - if errs := content.IsLabelValue(labels[i]); len(errs) != 0 { - logger.L().Debug("GetLabels - label is not valid", helpers.String("label", labels[i])) - for j := range errs { - logger.L().Debug("GetLabels - label err description", helpers.String("Err: ", errs[j])) - } - delete(labels, i) - } - } + labels[helpersv1.LearningPeriodMetadataKey] = formatDuration(watchedContainer.LearningPeriod) // Apply label overrides for k, v := range watchedContainer.LabelOverrides { - if v == "" { - delete(labels, k) - } else if errs := content.IsLabelValue(v); len(errs) != 0 { - logger.L().Warning("GetLabels - label override value is not valid, skipping", helpers.String("key", k), helpers.String("value", v)) - delete(labels, k) - } else { - labels[k] = v - } + labels[k] = v } if watchedContainer.ParentResourceVersion != "" { labels[helpersv1.ResourceVersionMetadataKey] = watchedContainer.ParentResourceVersion @@ -140,6 +129,20 @@ func GetLabels(cloudMetadata *armotypes.CloudMetadata, watchedContainer *Watched labels[helpersv1.RegionMetadataKey] = region } } + // Sanitize labels + for i := range labels { + if labels[i] == "" || (stripContainer && i == helpersv1.ContainerNameMetadataKey) { + delete(labels, i) + continue + } + if errs := content.IsLabelValue(labels[i]); len(errs) != 0 { + logger.L().Debug("GetLabels - label is not valid", helpers.String("label", labels[i])) + for j := range errs { + logger.L().Debug("GetLabels - label err description", helpers.String("Err: ", errs[j])) + } + delete(labels, i) + } + } return labels } diff --git a/pkg/objectcache/shared_container_data_test.go b/pkg/objectcache/shared_container_data_test.go index 63eb1983c3..ff1cd4752c 100644 --- a/pkg/objectcache/shared_container_data_test.go +++ b/pkg/objectcache/shared_container_data_test.go @@ -2,6 +2,7 @@ package objectcache import ( "testing" + "time" "github.com/kubescape/k8s-interface/instanceidhandler/v1" "github.com/stretchr/testify/assert" @@ -51,6 +52,7 @@ func Test_GetLabels(t *testing.T) { "kubescape.io/workload-api-version": "v1", "kubescape.io/workload-container-name": "redis", "kubescape.io/workload-kind": "Deployment", + "kubescape.io/learning-period": "0s", "kubescape.io/workload-name": "redis", "kubescape.io/workload-namespace": "aaa", }, @@ -67,6 +69,7 @@ func Test_GetLabels(t *testing.T) { want: map[string]string{ "kubescape.io/workload-api-version": "v1", "kubescape.io/workload-kind": "Deployment", + "kubescape.io/learning-period": "0s", "kubescape.io/workload-name": "redis", "kubescape.io/workload-namespace": "aaa", }, @@ -79,3 +82,36 @@ func Test_GetLabels(t *testing.T) { }) } } + +func Test_formatDuration(t *testing.T) { + tests := []struct { + d time.Duration + want string + }{ + { + d: 5 * time.Minute, + want: "5m", + }, + { + d: 1*time.Hour + 30*time.Minute, + want: "1h30m", + }, + { + d: 45 * time.Second, + want: "45s", + }, + { + d: 1*time.Hour + 30*time.Second, + want: 
"1h30s", + }, + { + d: 1 * time.Hour, + want: "1h", + }, + } + for _, tt := range tests { + t.Run(tt.d.String(), func(t *testing.T) { + assert.Equal(t, tt.want, formatDuration(tt.d)) + }) + } +} diff --git a/pkg/objectcache/v1/mock.go b/pkg/objectcache/v1/mock.go index c6cdeeb945..98c41e0db3 100644 --- a/pkg/objectcache/v1/mock.go +++ b/pkg/objectcache/v1/mock.go @@ -2,24 +2,40 @@ package objectcache import ( "context" + "errors" corev1 "k8s.io/api/core/v1" "github.com/goradd/maps" containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" "github.com/kubescape/node-agent/pkg/objectcache" - "github.com/kubescape/node-agent/pkg/objectcache/applicationprofilecache/callstackcache" + "github.com/kubescape/node-agent/pkg/objectcache/callstackcache" "github.com/kubescape/node-agent/pkg/watcher" "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" "k8s.io/apimachinery/pkg/runtime" ) -// RuleObjectCacheMock implementation as provided +// RuleObjectCacheMock is a test double for RuleObjectCache. +// +// Setter partition contract — SetApplicationProfile and SetNetworkNeighborhood +// both write into cpByContainerName entries but own non-overlapping fields: +// +// SetApplicationProfile → Architectures, Capabilities, Execs, Opens, Syscalls, +// SeccompProfile, Endpoints, ImageID, ImageTag, +// PolicyByRuleId, IdentifiedCallStacks +// SetNetworkNeighborhood → LabelSelector, Ingress, Egress +// +// Calling both setters produces a fully-populated ContainerProfile with no +// field conflict. Both setters apply a first-container-wins rule for r.cp +// (backward-compat pointer for single-container tests); the per-container map +// cpByContainerName is authoritative for multi-container tests. type RuleObjectCacheMock struct { profile *v1beta1.ApplicationProfile podSpec *corev1.PodSpec podStatus *corev1.PodStatus nn *v1beta1.NetworkNeighborhood + cp *v1beta1.ContainerProfile + cpByContainerName map[string]*v1beta1.ContainerProfile dnsCache map[string]string ContainerIDToSharedData *maps.SafeMap[string, *objectcache.WatchedContainerData] } @@ -34,9 +50,78 @@ func (r *RuleObjectCacheMock) GetCallStackSearchTree(string) *callstackcache.Cal func (r *RuleObjectCacheMock) SetApplicationProfile(profile *v1beta1.ApplicationProfile) { r.profile = profile + if profile == nil { + return + } + if r.cpByContainerName == nil { + r.cpByContainerName = make(map[string]*v1beta1.ContainerProfile) + } + apply := func(c *v1beta1.ApplicationProfileContainer) { + cp, ok := r.cpByContainerName[c.Name] + if !ok { + cp = &v1beta1.ContainerProfile{} + r.cpByContainerName[c.Name] = cp + } + cp.Spec.Architectures = profile.Spec.Architectures + cp.Spec.Capabilities = c.Capabilities + cp.Spec.Execs = c.Execs + cp.Spec.Opens = c.Opens + cp.Spec.Syscalls = c.Syscalls + cp.Spec.SeccompProfile = c.SeccompProfile + cp.Spec.Endpoints = c.Endpoints + cp.Spec.ImageID = c.ImageID + cp.Spec.ImageTag = c.ImageTag + cp.Spec.PolicyByRuleId = c.PolicyByRuleId + cp.Spec.IdentifiedCallStacks = c.IdentifiedCallStacks + } + for i := range profile.Spec.Containers { + apply(&profile.Spec.Containers[i]) + } + for i := range profile.Spec.InitContainers { + apply(&profile.Spec.InitContainers[i]) + } + for i := range profile.Spec.EphemeralContainers { + apply(&profile.Spec.EphemeralContainers[i]) + } + // r.cp = first container's entry (backward compat for single-container tests). 
+ switch { + case len(profile.Spec.Containers) > 0: + r.cp = r.cpByContainerName[profile.Spec.Containers[0].Name] + case len(profile.Spec.InitContainers) > 0: + r.cp = r.cpByContainerName[profile.Spec.InitContainers[0].Name] + case len(profile.Spec.EphemeralContainers) > 0: + r.cp = r.cpByContainerName[profile.Spec.EphemeralContainers[0].Name] + } } -func (r *RuleObjectCacheMock) ApplicationProfileCache() objectcache.ApplicationProfileCache { +func (r *RuleObjectCacheMock) GetContainerProfile(containerID string) *v1beta1.ContainerProfile { + if r.ContainerIDToSharedData != nil && containerID != "" { + data, ok := r.ContainerIDToSharedData.Load(containerID) + if !ok { + return nil + } + // Resolve the per-container profile via the registered InstanceID so + // multi-container tests get the correct container's profile. + if data != nil && data.InstanceID != nil { + if cp, found := r.cpByContainerName[data.InstanceID.GetContainerName()]; found { + return cp + } + } + } + return r.cp +} + +func (r *RuleObjectCacheMock) SetContainerProfile(cp *v1beta1.ContainerProfile) { + r.cp = cp +} + +func (r *RuleObjectCacheMock) GetContainerProfileState(_ string) *objectcache.ProfileState { + return &objectcache.ProfileState{Error: errors.New("mock: profile not found")} +} + +func (r *RuleObjectCacheMock) Start(_ context.Context) {} + +func (r *RuleObjectCacheMock) ContainerProfileCache() objectcache.ContainerProfileCache { return r } @@ -87,16 +172,46 @@ func (r *RuleObjectCacheMock) K8sObjectCache() objectcache.K8sObjectCache { return r } -func (r *RuleObjectCacheMock) NetworkNeighborhoodCache() objectcache.NetworkNeighborhoodCache { - return r -} - func (r *RuleObjectCacheMock) GetNetworkNeighborhood(string) *v1beta1.NetworkNeighborhood { return r.nn } func (r *RuleObjectCacheMock) SetNetworkNeighborhood(nn *v1beta1.NetworkNeighborhood) { r.nn = nn + if nn == nil { + return + } + if r.cpByContainerName == nil { + r.cpByContainerName = make(map[string]*v1beta1.ContainerProfile) + } + apply := func(c *v1beta1.NetworkNeighborhoodContainer) { + cp, ok := r.cpByContainerName[c.Name] + if !ok { + cp = &v1beta1.ContainerProfile{} + r.cpByContainerName[c.Name] = cp + } + cp.Spec.LabelSelector = nn.Spec.LabelSelector + cp.Spec.Ingress = c.Ingress + cp.Spec.Egress = c.Egress + } + for i := range nn.Spec.Containers { + apply(&nn.Spec.Containers[i]) + } + for i := range nn.Spec.InitContainers { + apply(&nn.Spec.InitContainers[i]) + } + for i := range nn.Spec.EphemeralContainers { + apply(&nn.Spec.EphemeralContainers[i]) + } + // r.cp = first container's entry (backward compat for single-container tests). 
+ switch { + case len(nn.Spec.Containers) > 0: + r.cp = r.cpByContainerName[nn.Spec.Containers[0].Name] + case len(nn.Spec.InitContainers) > 0: + r.cp = r.cpByContainerName[nn.Spec.InitContainers[0].Name] + case len(nn.Spec.EphemeralContainers) > 0: + r.cp = r.cpByContainerName[nn.Spec.EphemeralContainers[0].Name] + } } func (r *RuleObjectCacheMock) DnsCache() objectcache.DnsCache { diff --git a/pkg/objectcache/v1/objectcache.go b/pkg/objectcache/v1/objectcache.go index 9986077ee6..c1820a909e 100644 --- a/pkg/objectcache/v1/objectcache.go +++ b/pkg/objectcache/v1/objectcache.go @@ -8,16 +8,14 @@ var _ objectcache.ObjectCache = (*ObjectCacheImpl)(nil) type ObjectCacheImpl struct { k objectcache.K8sObjectCache - ap objectcache.ApplicationProfileCache - np objectcache.NetworkNeighborhoodCache + cp objectcache.ContainerProfileCache dc objectcache.DnsCache } -func NewObjectCache(k objectcache.K8sObjectCache, ap objectcache.ApplicationProfileCache, np objectcache.NetworkNeighborhoodCache, dc objectcache.DnsCache) *ObjectCacheImpl { +func NewObjectCache(k objectcache.K8sObjectCache, cp objectcache.ContainerProfileCache, dc objectcache.DnsCache) *ObjectCacheImpl { return &ObjectCacheImpl{ k: k, - ap: ap, - np: np, + cp: cp, dc: dc, } } @@ -26,11 +24,8 @@ func (o *ObjectCacheImpl) K8sObjectCache() objectcache.K8sObjectCache { return o.k } -func (o *ObjectCacheImpl) ApplicationProfileCache() objectcache.ApplicationProfileCache { - return o.ap -} -func (o *ObjectCacheImpl) NetworkNeighborhoodCache() objectcache.NetworkNeighborhoodCache { - return o.np +func (o *ObjectCacheImpl) ContainerProfileCache() objectcache.ContainerProfileCache { + return o.cp } func (o *ObjectCacheImpl) DnsCache() objectcache.DnsCache { diff --git a/pkg/objectcache/v1/objectcache_test.go b/pkg/objectcache/v1/objectcache_test.go index 207722ea5a..6af7e69c5e 100644 --- a/pkg/objectcache/v1/objectcache_test.go +++ b/pkg/objectcache/v1/objectcache_test.go @@ -10,18 +10,12 @@ import ( func TestK8sObjectCache(t *testing.T) { k := &objectcache.K8sObjectCacheMock{} - k8sObjectCache := NewObjectCache(k, nil, nil, nil) + k8sObjectCache := NewObjectCache(k, nil, nil) assert.NotNil(t, k8sObjectCache.K8sObjectCache()) } -func TestApplicationProfileCache(t *testing.T) { - ap := &objectcache.ApplicationProfileCacheMock{} - k8sObjectCache := NewObjectCache(nil, ap, nil, nil) - assert.NotNil(t, k8sObjectCache.ApplicationProfileCache()) -} - -func TestNetworkNeighborhoodCache(t *testing.T) { - nn := &objectcache.NetworkNeighborhoodCacheMock{} - k8sObjectCache := NewObjectCache(nil, nil, nn, nil) - assert.NotNil(t, k8sObjectCache.NetworkNeighborhoodCache()) +func TestContainerProfileCache(t *testing.T) { + cp := &objectcache.ContainerProfileCacheMock{} + k8sObjectCache := NewObjectCache(nil, cp, nil) + assert.NotNil(t, k8sObjectCache.ContainerProfileCache()) } diff --git a/pkg/rulemanager/cel/libraries/applicationprofile/capability.go b/pkg/rulemanager/cel/libraries/applicationprofile/capability.go index 5f3c09f217..13cbc0866c 100644 --- a/pkg/rulemanager/cel/libraries/applicationprofile/capability.go +++ b/pkg/rulemanager/cel/libraries/applicationprofile/capability.go @@ -23,12 +23,12 @@ func (l *apLibrary) wasCapabilityUsed(containerID, capabilityName ref.Val) ref.V return types.MaybeNoSuchOverloadErr(capabilityName) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return 
cache.NewProfileNotAvailableErr("%v", err) } - if slices.Contains(container.Capabilities, capabilityNameStr) { + if slices.Contains(cp.Spec.Capabilities, capabilityNameStr) { return types.Bool(true) } diff --git a/pkg/rulemanager/cel/libraries/applicationprofile/exec.go b/pkg/rulemanager/cel/libraries/applicationprofile/exec.go index d7a16d0908..25b92f2366 100644 --- a/pkg/rulemanager/cel/libraries/applicationprofile/exec.go +++ b/pkg/rulemanager/cel/libraries/applicationprofile/exec.go @@ -32,14 +32,14 @@ func (l *apLibrary) wasExecuted(containerID, path ref.Val) ref.Val { return types.Bool(true) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { // Return a special error that will NOT be cached, allowing retry when profile becomes available. // The caller should convert this to false after the cache layer. return cache.NewProfileNotAvailableErr("%v", err) } - for _, exec := range container.Execs { + for _, exec := range cp.Spec.Execs { if exec.Path == pathStr { return types.Bool(true) } @@ -77,14 +77,14 @@ func (l *apLibrary) wasExecutedWithArgs(containerID, path, args ref.Val) ref.Val return types.Bool(true) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { // Return a special error that will NOT be cached, allowing retry when profile becomes available. // The caller should convert this to false after the cache layer. return cache.NewProfileNotAvailableErr("%v", err) } - for _, exec := range container.Execs { + for _, exec := range cp.Spec.Execs { if exec.Path == pathStr { if slices.Compare(exec.Args, celArgs) == 0 { return types.Bool(true) diff --git a/pkg/rulemanager/cel/libraries/applicationprofile/http.go b/pkg/rulemanager/cel/libraries/applicationprofile/http.go index ef7132e29c..fe91609a55 100644 --- a/pkg/rulemanager/cel/libraries/applicationprofile/http.go +++ b/pkg/rulemanager/cel/libraries/applicationprofile/http.go @@ -28,12 +28,12 @@ func (l *apLibrary) wasEndpointAccessed(containerID, endpoint ref.Val) ref.Val { return types.MaybeNoSuchOverloadErr(endpoint) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, ep := range container.Endpoints { + for _, ep := range cp.Spec.Endpoints { if dynamicpathdetector.CompareDynamic(ep.Endpoint, endpointStr) { return types.Bool(true) } @@ -61,12 +61,12 @@ func (l *apLibrary) wasEndpointAccessedWithMethod(containerID, endpoint, method return types.MaybeNoSuchOverloadErr(method) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, ep := range container.Endpoints { + for _, ep := range cp.Spec.Endpoints { if dynamicpathdetector.CompareDynamic(ep.Endpoint, endpointStr) { if slices.Contains(ep.Methods, methodStr) { return types.Bool(true) @@ -97,12 +97,12 @@ func (l *apLibrary) wasEndpointAccessedWithMethods(containerID, endpoint, method return types.NewErr("failed to parse methods: %v", err) } - container, _, err := 
profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, ep := range container.Endpoints { + for _, ep := range cp.Spec.Endpoints { if dynamicpathdetector.CompareDynamic(ep.Endpoint, endpointStr) { for _, method := range celMethods { if slices.Contains(ep.Methods, method) { @@ -130,12 +130,12 @@ func (l *apLibrary) wasEndpointAccessedWithPrefix(containerID, prefix ref.Val) r return types.MaybeNoSuchOverloadErr(prefix) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, ep := range container.Endpoints { + for _, ep := range cp.Spec.Endpoints { if strings.HasPrefix(ep.Endpoint, prefixStr) { return types.Bool(true) } @@ -159,12 +159,12 @@ func (l *apLibrary) wasEndpointAccessedWithSuffix(containerID, suffix ref.Val) r return types.MaybeNoSuchOverloadErr(suffix) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, ep := range container.Endpoints { + for _, ep := range cp.Spec.Endpoints { if strings.HasSuffix(ep.Endpoint, suffixStr) { return types.Bool(true) } @@ -189,12 +189,12 @@ func (l *apLibrary) wasHostAccessed(containerID, host ref.Val) ref.Val { } // Check HTTP endpoints for host access - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, ep := range container.Endpoints { + for _, ep := range cp.Spec.Endpoints { // Parse the endpoint URL to extract host if parsedURL, err := url.Parse(ep.Endpoint); err == nil && parsedURL.Host != "" { if parsedURL.Host == hostStr || parsedURL.Hostname() == hostStr { diff --git a/pkg/rulemanager/cel/libraries/applicationprofile/open.go b/pkg/rulemanager/cel/libraries/applicationprofile/open.go index fc584e6fcb..63d8f604a4 100644 --- a/pkg/rulemanager/cel/libraries/applicationprofile/open.go +++ b/pkg/rulemanager/cel/libraries/applicationprofile/open.go @@ -25,12 +25,12 @@ func (l *apLibrary) wasPathOpened(containerID, path ref.Val) ref.Val { return types.MaybeNoSuchOverloadErr(path) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, open := range container.Opens { + for _, open := range cp.Spec.Opens { if dynamicpathdetector.CompareDynamic(open.Path, pathStr) { return types.Bool(true) } @@ -59,12 +59,12 @@ func (l *apLibrary) wasPathOpenedWithFlags(containerID, path, flags ref.Val) ref return types.NewErr("failed to parse flags: %v", err) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, open := range container.Opens { + for _, open := range cp.Spec.Opens { if 
dynamicpathdetector.CompareDynamic(open.Path, pathStr) { if compareOpenFlags(celFlags, open.Flags) { return types.Bool(true) @@ -89,12 +89,12 @@ func (l *apLibrary) wasPathOpenedWithSuffix(containerID, suffix ref.Val) ref.Val return types.MaybeNoSuchOverloadErr(suffix) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, open := range container.Opens { + for _, open := range cp.Spec.Opens { if strings.HasSuffix(open.Path, suffixStr) { return types.Bool(true) } @@ -117,12 +117,12 @@ func (l *apLibrary) wasPathOpenedWithPrefix(containerID, prefix ref.Val) ref.Val return types.MaybeNoSuchOverloadErr(prefix) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - for _, open := range container.Opens { + for _, open := range cp.Spec.Opens { if strings.HasPrefix(open.Path, prefixStr) { return types.Bool(true) } diff --git a/pkg/rulemanager/cel/libraries/applicationprofile/syscall.go b/pkg/rulemanager/cel/libraries/applicationprofile/syscall.go index 7a26aa1846..7383aec5ba 100644 --- a/pkg/rulemanager/cel/libraries/applicationprofile/syscall.go +++ b/pkg/rulemanager/cel/libraries/applicationprofile/syscall.go @@ -23,12 +23,12 @@ func (l *apLibrary) wasSyscallUsed(containerID, syscallName ref.Val) ref.Val { return types.MaybeNoSuchOverloadErr(syscallName) } - container, _, err := profilehelper.GetContainerApplicationProfile(l.objectCache, containerIDStr) + cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr) if err != nil { return cache.NewProfileNotAvailableErr("%v", err) } - if slices.Contains(container.Syscalls, syscallNameStr) { + if slices.Contains(cp.Spec.Syscalls, syscallNameStr) { return types.Bool(true) } diff --git a/pkg/rulemanager/cel/libraries/k8s/k8s_test.go b/pkg/rulemanager/cel/libraries/k8s/k8s_test.go index e8001c2a8b..039c9fbeb4 100644 --- a/pkg/rulemanager/cel/libraries/k8s/k8s_test.go +++ b/pkg/rulemanager/cel/libraries/k8s/k8s_test.go @@ -61,7 +61,7 @@ func TestK8sLibrary(t *testing.T) { // Add the pod to the cache directly k8sObjCache.AddHandler(context.Background(), testPod) - objectCache := objectcache.NewObjectCache(k8sObjCache, nil, nil, nil) + objectCache := objectcache.NewObjectCache(k8sObjCache, nil, nil) env, err := cel.NewEnv( cel.Variable("event", cel.AnyType), K8s(objectCache.K8sObjectCache(), config.Config{}), @@ -134,7 +134,7 @@ func TestK8sLibraryGetContainerByName(t *testing.T) { // Add the pod to the cache directly k8sObjCache.AddHandler(context.Background(), testPod) - objectCache := objectcache.NewObjectCache(k8sObjCache, nil, nil, nil) + objectCache := objectcache.NewObjectCache(k8sObjCache, nil, nil) env, err := cel.NewEnv( cel.Variable("event", cel.AnyType), K8s(objectCache.K8sObjectCache(), config.Config{}), diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/network.go b/pkg/rulemanager/cel/libraries/networkneighborhood/network.go index 4fb334f7f1..0449ebf962 100644 --- a/pkg/rulemanager/cel/libraries/networkneighborhood/network.go +++ b/pkg/rulemanager/cel/libraries/networkneighborhood/network.go @@ -24,12 +24,12 @@ func (l *nnLibrary) wasAddressInEgress(containerID, address ref.Val) ref.Val { return 
diff --git a/pkg/rulemanager/cel/libraries/networkneighborhood/network.go b/pkg/rulemanager/cel/libraries/networkneighborhood/network.go
index 4fb334f7f1..0449ebf962 100644
--- a/pkg/rulemanager/cel/libraries/networkneighborhood/network.go
+++ b/pkg/rulemanager/cel/libraries/networkneighborhood/network.go
@@ -24,12 +24,12 @@ func (l *nnLibrary) wasAddressInEgress(containerID, address ref.Val) ref.Val {
 		return types.MaybeNoSuchOverloadErr(address)
 	}
 
-	container, err := profilehelper.GetContainerNetworkNeighborhood(l.objectCache, containerIDStr)
+	cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr)
 	if err != nil {
 		return cache.NewProfileNotAvailableErr("%v", err)
 	}
 
-	for _, egress := range container.Egress {
+	for _, egress := range cp.Spec.Egress {
 		if egress.IPAddress == addressStr {
 			return types.Bool(true)
 		}
@@ -52,12 +52,12 @@ func (l *nnLibrary) wasAddressInIngress(containerID, address ref.Val) ref.Val {
 		return types.MaybeNoSuchOverloadErr(address)
 	}
 
-	container, err := profilehelper.GetContainerNetworkNeighborhood(l.objectCache, containerIDStr)
+	cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr)
 	if err != nil {
 		return cache.NewProfileNotAvailableErr("%v", err)
 	}
 
-	for _, ingress := range container.Ingress {
+	for _, ingress := range cp.Spec.Ingress {
 		if ingress.IPAddress == addressStr {
 			return types.Bool(true)
 		}
@@ -80,12 +80,12 @@ func (l *nnLibrary) isDomainInEgress(containerID, domain ref.Val) ref.Val {
 		return types.MaybeNoSuchOverloadErr(domain)
 	}
 
-	container, err := profilehelper.GetContainerNetworkNeighborhood(l.objectCache, containerIDStr)
+	cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr)
 	if err != nil {
 		return cache.NewProfileNotAvailableErr("%v", err)
 	}
 
-	for _, egress := range container.Egress {
+	for _, egress := range cp.Spec.Egress {
 		if slices.Contains(egress.DNSNames, domainStr) || egress.DNS == domainStr {
 			return types.Bool(true)
 		}
@@ -108,12 +108,12 @@ func (l *nnLibrary) isDomainInIngress(containerID, domain ref.Val) ref.Val {
 		return types.MaybeNoSuchOverloadErr(domain)
 	}
 
-	container, err := profilehelper.GetContainerNetworkNeighborhood(l.objectCache, containerIDStr)
+	cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr)
 	if err != nil {
 		return cache.NewProfileNotAvailableErr("%v", err)
 	}
 
-	for _, ingress := range container.Ingress {
+	for _, ingress := range cp.Spec.Ingress {
 		if slices.Contains(ingress.DNSNames, domainStr) {
 			return types.Bool(true)
 		}
@@ -144,12 +144,12 @@ func (l *nnLibrary) wasAddressPortProtocolInEgress(containerID, address, port, p
 		return types.MaybeNoSuchOverloadErr(protocol)
 	}
 
-	container, err := profilehelper.GetContainerNetworkNeighborhood(l.objectCache, containerIDStr)
+	cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr)
 	if err != nil {
 		return cache.NewProfileNotAvailableErr("%v", err)
 	}
 
-	for _, egress := range container.Egress {
+	for _, egress := range cp.Spec.Egress {
 		if egress.IPAddress == addressStr {
 			for _, portInfo := range egress.Ports {
 				if portInfo.Protocol == v1beta1.Protocol(protocolStr) && portInfo.Port != nil && *portInfo.Port == int32(portInt) {
@@ -184,12 +184,12 @@ func (l *nnLibrary) wasAddressPortProtocolInIngress(containerID, address, port,
 		return types.MaybeNoSuchOverloadErr(protocol)
 	}
 
-	container, err := profilehelper.GetContainerNetworkNeighborhood(l.objectCache, containerIDStr)
+	cp, _, err := profilehelper.GetContainerProfile(l.objectCache, containerIDStr)
 	if err != nil {
 		return cache.NewProfileNotAvailableErr("%v", err)
 	}
 
-	for _, ingress := range container.Ingress {
+	for _, ingress := range cp.Spec.Ingress {
 		if ingress.IPAddress == addressStr {
 			for _, portInfo := range ingress.Ports {
 				if portInfo.Protocol == v1beta1.Protocol(protocolStr) && portInfo.Port != nil && *portInfo.Port == int32(portInt) {
diff --git a/pkg/rulemanager/profilehelper/profilehelper.go b/pkg/rulemanager/profilehelper/profilehelper.go
index f177bb0a94..0f4d5ed0e3 100644
--- a/pkg/rulemanager/profilehelper/profilehelper.go
+++ b/pkg/rulemanager/profilehelper/profilehelper.go
@@ -9,58 +9,19 @@ import (
 	corev1 "k8s.io/api/core/v1"
 )
 
-func GetApplicationProfile(containerID string, objectCache objectcache.ObjectCache) (*v1beta1.ApplicationProfile, error) {
-	ap := objectCache.ApplicationProfileCache().GetApplicationProfile(containerID)
-	if ap == nil {
-		return nil, errors.New("no profile available")
+// GetContainerProfile returns the ContainerProfile for a containerID plus its
+// SyncChecksumMetadataKey annotation. It is the single profile-lookup API now
+// that the legacy ApplicationProfile/NetworkNeighborhood helpers are removed.
+func GetContainerProfile(objectCache objectcache.ObjectCache, containerID string) (*v1beta1.ContainerProfile, string, error) {
+	cpc := objectCache.ContainerProfileCache()
+	if cpc == nil {
+		return nil, "", errors.New("no container profile cache available")
 	}
-	return ap, nil
-}
-
-func GetNetworkNeighborhood(containerID string, objectCache objectcache.ObjectCache) (*v1beta1.NetworkNeighborhood, error) {
-	nn := objectCache.NetworkNeighborhoodCache().GetNetworkNeighborhood(containerID)
-	if nn == nil {
-		return nil, errors.New("no profile available")
-	}
-	return nn, nil
-}
-
-func GetContainerFromApplicationProfile(ap *v1beta1.ApplicationProfile, containerName string) (v1beta1.ApplicationProfileContainer, error) {
-	for _, s := range ap.Spec.Containers {
-		if s.Name == containerName {
-			return s, nil
-		}
-	}
-	for _, s := range ap.Spec.InitContainers {
-		if s.Name == containerName {
-			return s, nil
-		}
-	}
-	for _, s := range ap.Spec.EphemeralContainers {
-		if s.Name == containerName {
-			return s, nil
-		}
+	cp := cpc.GetContainerProfile(containerID)
+	if cp == nil {
+		return nil, "", errors.New("no profile available")
 	}
-	return v1beta1.ApplicationProfileContainer{}, errors.New("container not found")
-}
-
-func GetContainerFromNetworkNeighborhood(nn *v1beta1.NetworkNeighborhood, containerName string) (v1beta1.NetworkNeighborhoodContainer, error) {
-	for _, c := range nn.Spec.Containers {
-		if c.Name == containerName {
-			return c, nil
-		}
-	}
-	for _, c := range nn.Spec.InitContainers {
-		if c.Name == containerName {
-			return c, nil
-		}
-	}
-	for _, c := range nn.Spec.EphemeralContainers {
-		if c.Name == containerName {
-			return c, nil
-		}
-	}
-	return v1beta1.NetworkNeighborhoodContainer{}, errors.New("container not found")
+	return cp, cp.Annotations[helpers.SyncChecksumMetadataKey], nil
 }
 
 func GetContainerName(objectCache objectcache.ObjectCache, containerID string) string {
@@ -92,40 +53,3 @@ func GetPodSpec(objectCache objectcache.ObjectCache, containerID string) (*corev
 	return podSpec, nil
 }
 
-func GetContainerApplicationProfile(objectCache objectcache.ObjectCache, containerID string) (v1beta1.ApplicationProfileContainer, string, error) {
-	ap, err := GetApplicationProfile(containerID, objectCache)
-	if err != nil {
-		return v1beta1.ApplicationProfileContainer{}, "", err
-	}
-
-	containerName := GetContainerName(objectCache, containerID)
-	if containerName == "" {
-		return v1beta1.ApplicationProfileContainer{}, "", errors.New("container name not found")
-	}
-
-	container, err := GetContainerFromApplicationProfile(ap, containerName)
-	if err != nil {
-		return v1beta1.ApplicationProfileContainer{}, "", err
-	}
-
-	return container, ap.Annotations[helpers.SyncChecksumMetadataKey], nil
-}
-
-func GetContainerNetworkNeighborhood(objectCache objectcache.ObjectCache, containerID string) (v1beta1.NetworkNeighborhoodContainer, error) {
-	nn, err := GetNetworkNeighborhood(containerID, objectCache)
-	if err != nil {
-		return v1beta1.NetworkNeighborhoodContainer{}, err
-	}
-
-	containerName := GetContainerName(objectCache, containerID)
-	if containerName == "" {
-		return v1beta1.NetworkNeighborhoodContainer{}, errors.New("container name not found")
-	}
-
-	container, err := GetContainerFromNetworkNeighborhood(nn, containerName)
-	if err != nil {
-		return v1beta1.NetworkNeighborhoodContainer{}, err
-	}
-
-	return container, nil
-}
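
For illustration only (not part of the patch): every migrated call site above follows the same pattern — resolve the whole ContainerProfile once through profilehelper.GetContainerProfile, then read the relevant slice from cp.Spec. A minimal sketch of that pattern, assuming the packages these files already import; the function name is made up:

// Sketch of the shared lookup pattern; not code from this change.
func wasPathOpenedSketch(oc objectcache.ObjectCache, containerID, path string) (bool, error) {
	// One lookup replaces the old per-container ApplicationProfile/NetworkNeighborhood views.
	cp, _, err := profilehelper.GetContainerProfile(oc, containerID)
	if err != nil {
		return false, err // CEL callers surface this as a profile-not-available error
	}
	for _, open := range cp.Spec.Opens {
		if dynamicpathdetector.CompareDynamic(open.Path, path) {
			return true, nil
		}
	}
	return false, nil
}
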
diff --git a/pkg/rulemanager/rule_manager.go b/pkg/rulemanager/rule_manager.go
index 7fde0990ad..a14a5ee86b 100644
--- a/pkg/rulemanager/rule_manager.go
+++ b/pkg/rulemanager/rule_manager.go
@@ -200,7 +200,7 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent)
 		return
 	}
 
-	_, apChecksum, err := profilehelper.GetContainerApplicationProfile(rm.objectCache, enrichedEvent.ContainerID)
+	_, apChecksum, err := profilehelper.GetContainerProfile(rm.objectCache, enrichedEvent.ContainerID)
 	profileExists = err == nil
 
 	// Early exit if monitoring is disabled for this context - skip rule evaluation
@@ -345,9 +345,9 @@ func (rm *RuleManager) HasApplicableRuleBindings(namespace, name string) bool {
 
 func (rm *RuleManager) HasFinalApplicationProfile(pod *corev1.Pod) bool {
 	for _, c := range utils.GetContainerStatuses(pod.Status) {
-		ap := rm.objectCache.ApplicationProfileCache().GetApplicationProfile(utils.TrimRuntimePrefix(c.ContainerID))
-		if ap != nil {
-			if status, ok := ap.Annotations[helpersv1.StatusMetadataKey]; ok {
+		cp := rm.objectCache.ContainerProfileCache().GetContainerProfile(utils.TrimRuntimePrefix(c.ContainerID))
+		if cp != nil {
+			if status, ok := cp.Annotations[helpersv1.StatusMetadataKey]; ok {
 				// in theory, only completed profiles are stored in cache, but we check anyway
 				return status == helpersv1.Completed
 			}
@@ -410,12 +410,12 @@ func (rm *RuleManager) EvaluatePolicyRulesForEvent(eventType utils.EventType, ev
 }
 
 func (rm *RuleManager) validateRulePolicy(rule typesv1.Rule, event utils.K8sEvent, containerID string) bool {
-	ap, _, err := profilehelper.GetContainerApplicationProfile(rm.objectCache, containerID)
+	cp, _, err := profilehelper.GetContainerProfile(rm.objectCache, containerID)
 	if err != nil {
 		return false
 	}
 
-	allowed, err := rm.rulePolicyValidator.Validate(rule.ID, event.(utils.EnrichEvent).GetComm(), &ap)
+	allowed, err := rm.rulePolicyValidator.Validate(rule.ID, event.(utils.EnrichEvent).GetComm(), cp)
 	if err != nil {
 		logger.L().Error("RuleManager - failed to validate rule policy", helpers.Error(err))
 		return false
diff --git a/pkg/rulemanager/ruleadapters/creator.go b/pkg/rulemanager/ruleadapters/creator.go
index 9420569f7f..75783f9d32 100644
--- a/pkg/rulemanager/ruleadapters/creator.go
+++ b/pkg/rulemanager/ruleadapters/creator.go
@@ -145,7 +145,7 @@ func (r *RuleFailureCreator) setProfileMetadata(rule typesv1.Rule, ruleFailure *
 
 	switch profileType {
 	case armotypes.ApplicationProfile:
-		state := objectCache.ApplicationProfileCache().GetApplicationProfileState(triggerEvent.GetContainerID())
+		state := objectCache.ContainerProfileCache().GetContainerProfileState(triggerEvent.GetContainerID())
 		if state != nil {
 			profileMetadata := &armotypes.ProfileMetadata{
 				Status: state.Status,
@@ -162,7 +162,7 @@ func (r *RuleFailureCreator) setProfileMetadata(rule typesv1.Rule, ruleFailure *
 		}
 
 	case armotypes.NetworkProfile:
-		state := objectCache.NetworkNeighborhoodCache().GetNetworkNeighborhoodState(triggerEvent.GetContainerID())
+		state := objectCache.ContainerProfileCache().GetContainerProfileState(triggerEvent.GetContainerID())
 		if state != nil {
 			profileMetadata := &armotypes.ProfileMetadata{
 				Status: state.Status,
diff --git a/pkg/rulemanager/rulepolicy.go b/pkg/rulemanager/rulepolicy.go
index 9a58943b00..f5562b2b2c 100644
--- a/pkg/rulemanager/rulepolicy.go
+++ b/pkg/rulemanager/rulepolicy.go
@@ -20,12 +20,12 @@ func NewRulePolicyValidator(objectCache objectcache.ObjectCache) *RulePolicyVali
 	}
 }
 
-func (v *RulePolicyValidator) Validate(ruleId string, process string, ap *v1beta1.ApplicationProfileContainer) (bool, error) {
-	if _, ok := ap.PolicyByRuleId[ruleId]; !ok {
+func (v *RulePolicyValidator) Validate(ruleId string, process string, cp *v1beta1.ContainerProfile) (bool, error) {
+	if _, ok := cp.Spec.PolicyByRuleId[ruleId]; !ok {
 		return false, nil
 	}
 
-	if policy, ok := ap.PolicyByRuleId[ruleId]; ok {
+	if policy, ok := cp.Spec.PolicyByRuleId[ruleId]; ok {
 		if policy.AllowedContainer || slices.Contains(policy.AllowedProcesses, process) {
 			return true, nil
 		}
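
For illustration only: rule-policy validation now reads the allow-list straight off the ContainerProfile spec instead of a per-container ApplicationProfile view. A hedged sketch of the check Validate performs, using only the fields visible in the rulepolicy.go hunk above (the helper name is hypothetical):

// Sketch of the policy lookup; mirrors rulepolicy.go above, not copied from it.
func isProcessAllowedSketch(cp *v1beta1.ContainerProfile, ruleID, comm string) bool {
	policy, ok := cp.Spec.PolicyByRuleId[ruleID]
	if !ok {
		return false // no policy for this rule: deny by default
	}
	// Allowed when the whole container is allow-listed or the process (comm) is.
	return policy.AllowedContainer || slices.Contains(policy.AllowedProcesses, comm)
}
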
diff --git a/pkg/sbommanager/v1/sbom_manager.go b/pkg/sbommanager/v1/sbom_manager.go
index 577e4e1a61..2f6d059b93 100644
--- a/pkg/sbommanager/v1/sbom_manager.go
+++ b/pkg/sbommanager/v1/sbom_manager.go
@@ -17,6 +17,7 @@ import (
 
 	"github.com/DmitriyVTitov/size"
 	"github.com/anchore/syft/syft"
+	"github.com/anchore/syft/syft/cataloging"
 	"github.com/anchore/syft/syft/cataloging/pkgcataloging"
 	sbomcataloger "github.com/anchore/syft/syft/pkg/cataloger/sbom"
 	"github.com/aquilax/truncate"
@@ -471,6 +472,13 @@ func (s *SbomManager) processContainerWithMetadata(notif containercollection.Pub
 	sbomCfg := syft.DefaultCreateSBOMConfig()
 	sbomCfg.ToolName = "syft"
 	sbomCfg.ToolVersion = s.version
+	sbomCfg = sbomCfg.WithCatalogerSelection(
+		cataloging.NewSelectionRequest().WithRemovals(
+			"file-digest-cataloger",
+			"file-metadata-cataloger",
+			"file-executable-cataloger",
+		),
+	)
 	if s.cfg.EnableEmbeddedSboms {
 		sbomCfg.WithCatalogers(pkgcataloging.NewCatalogerReference(sbomcataloger.NewCataloger(), []string{pkgcataloging.ImageTag}))
 	}
diff --git a/pkg/sbomscanner/v1/server.go b/pkg/sbomscanner/v1/server.go
index 1b105bb286..360d67c70d 100644
--- a/pkg/sbomscanner/v1/server.go
+++ b/pkg/sbomscanner/v1/server.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	"github.com/anchore/syft/syft"
+	"github.com/anchore/syft/syft/cataloging"
 	"github.com/anchore/syft/syft/cataloging/pkgcataloging"
 	sbomcataloger "github.com/anchore/syft/syft/pkg/cataloger/sbom"
 	"github.com/kubescape/go-logger"
@@ -59,6 +60,13 @@ func (s *scannerServer) CreateSBOM(ctx context.Context, req *pb.CreateSBOMReques
 	cfg := syft.DefaultCreateSBOMConfig()
 	cfg.ToolName = "syft"
 	cfg.ToolVersion = s.version
+	cfg = cfg.WithCatalogerSelection(
+		cataloging.NewSelectionRequest().WithRemovals(
+			"file-digest-cataloger",
+			"file-metadata-cataloger",
+			"file-executable-cataloger",
+		),
+	)
 	if req.EnableEmbeddedSboms {
 		cfg.WithCatalogers(pkgcataloging.NewCatalogerReference(sbomcataloger.NewCataloger(), []string{pkgcataloging.ImageTag}))
 	}
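
For illustration only: both SBOM code paths above apply the same cataloger selection, so the construction can be read in isolation. A sketch assuming the syft version this repo pins (the *syft.CreateSBOMConfig return type name is inferred; the calls themselves are exactly those used in the hunks):

// Sketch of the shared syft configuration; not a new API.
import (
	"github.com/anchore/syft/syft"
	"github.com/anchore/syft/syft/cataloging"
)

func newSBOMConfigSketch(version string) *syft.CreateSBOMConfig {
	cfg := syft.DefaultCreateSBOMConfig()
	cfg.ToolName = "syft"
	cfg.ToolVersion = version
	// Exclude the file-level catalogers, as both hunks above do.
	return cfg.WithCatalogerSelection(
		cataloging.NewSelectionRequest().WithRemovals(
			"file-digest-cataloger",
			"file-metadata-cataloger",
			"file-executable-cataloger",
		),
	)
}
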
"github.com/kubescape/storage/pkg/generated/clientset/versioned/typed/softwarecomposition/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -8,10 +10,11 @@ import ( ) type ProfileClient interface { - GetApplicationProfile(namespace, name string) (*v1beta1.ApplicationProfile, error) - GetNetworkNeighborhood(namespace, name string) (*v1beta1.NetworkNeighborhood, error) - ListApplicationProfiles(namespace string, limit int64, cont string) (*v1beta1.ApplicationProfileList, error) - ListNetworkNeighborhoods(namespace string, limit int64, cont string) (*v1beta1.NetworkNeighborhoodList, error) + GetApplicationProfile(ctx context.Context, namespace, name string) (*v1beta1.ApplicationProfile, error) + GetNetworkNeighborhood(ctx context.Context, namespace, name string) (*v1beta1.NetworkNeighborhood, error) + GetContainerProfile(ctx context.Context, namespace, name string) (*v1beta1.ContainerProfile, error) + ListApplicationProfiles(ctx context.Context, namespace string, limit int64, cont string) (*v1beta1.ApplicationProfileList, error) + ListNetworkNeighborhoods(ctx context.Context, namespace string, limit int64, cont string) (*v1beta1.NetworkNeighborhoodList, error) } // ProfileCreator defines the interface for creating container profiles diff --git a/pkg/storage/storage_mock.go b/pkg/storage/storage_mock.go index 1f1c0dcbc2..13e96f3aaf 100644 --- a/pkg/storage/storage_mock.go +++ b/pkg/storage/storage_mock.go @@ -1,6 +1,8 @@ package storage import ( + "context" + "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" spdxv1beta1 "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" beta1 "github.com/kubescape/storage/pkg/generated/clientset/versioned/typed/softwarecomposition/v1beta1" @@ -35,12 +37,21 @@ func (sc *StorageHttpClientMock) CreateSBOM(SBOM *v1beta1.SBOMSyft) (*v1beta1.SB return SBOM, nil } -func (sc *StorageHttpClientMock) GetApplicationProfile(_, _ string) (*spdxv1beta1.ApplicationProfile, error) { +func (sc *StorageHttpClientMock) GetContainerProfile(_ context.Context, namespace, name string) (*v1beta1.ContainerProfile, error) { + for _, p := range sc.ContainerProfiles { + if p != nil && p.Namespace == namespace && p.Name == name { + return p, nil + } + } + return nil, nil +} + +func (sc *StorageHttpClientMock) GetApplicationProfile(_ context.Context, _, _ string) (*spdxv1beta1.ApplicationProfile, error) { //TODO implement me panic("implement me") } -func (sc *StorageHttpClientMock) GetNetworkNeighborhood(_, _ string) (*spdxv1beta1.NetworkNeighborhood, error) { +func (sc *StorageHttpClientMock) GetNetworkNeighborhood(_ context.Context, _, _ string) (*spdxv1beta1.NetworkNeighborhood, error) { //TODO implement me panic("implement me") } @@ -52,12 +63,12 @@ func (sc *StorageHttpClientMock) GetStorageClient() beta1.SpdxV1beta1Interface { return nil } -func (sc *StorageHttpClientMock) ListApplicationProfiles(namespace string, limit int64, cont string) (*spdxv1beta1.ApplicationProfileList, error) { +func (sc *StorageHttpClientMock) ListApplicationProfiles(_ context.Context, namespace string, limit int64, cont string) (*spdxv1beta1.ApplicationProfileList, error) { //TODO implement me panic("implement me") } -func (sc *StorageHttpClientMock) ListNetworkNeighborhoods(namespace string, limit int64, cont string) (*spdxv1beta1.NetworkNeighborhoodList, error) { +func (sc *StorageHttpClientMock) ListNetworkNeighborhoods(_ context.Context, namespace string, limit int64, cont string) (*spdxv1beta1.NetworkNeighborhoodList, error) { //TODO implement me 
panic("implement me") } diff --git a/pkg/storage/v1/applicationprofile.go b/pkg/storage/v1/applicationprofile.go index 96fa7e1bb0..39f0543288 100644 --- a/pkg/storage/v1/applicationprofile.go +++ b/pkg/storage/v1/applicationprofile.go @@ -7,12 +7,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func (sc *Storage) GetApplicationProfile(namespace, name string) (*v1beta1.ApplicationProfile, error) { - return sc.storageClient.ApplicationProfiles(namespace).Get(context.Background(), name, metav1.GetOptions{}) +func (sc *Storage) GetApplicationProfile(ctx context.Context, namespace, name string) (*v1beta1.ApplicationProfile, error) { + return sc.storageClient.ApplicationProfiles(namespace).Get(ctx, name, metav1.GetOptions{}) } -func (sc *Storage) ListApplicationProfiles(namespace string, limit int64, cont string) (*v1beta1.ApplicationProfileList, error) { - return sc.storageClient.ApplicationProfiles(namespace).List(context.Background(), metav1.ListOptions{ +func (sc *Storage) ListApplicationProfiles(ctx context.Context, namespace string, limit int64, cont string) (*v1beta1.ApplicationProfileList, error) { + return sc.storageClient.ApplicationProfiles(namespace).List(ctx, metav1.ListOptions{ Limit: limit, Continue: cont, }) diff --git a/pkg/storage/v1/containerprofile.go b/pkg/storage/v1/containerprofile.go index 620e42b70e..69fbc0ea5a 100644 --- a/pkg/storage/v1/containerprofile.go +++ b/pkg/storage/v1/containerprofile.go @@ -7,8 +7,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// CreateContainerProfileDirect directly creates the profile without queuing -// This implements the ProfileCreator interface +func (sc *Storage) GetContainerProfile(ctx context.Context, namespace, name string) (*v1beta1.ContainerProfile, error) { + return sc.storageClient.ContainerProfiles(namespace).Get(ctx, name, metav1.GetOptions{}) +} + +// CreateContainerProfileDirect directly creates the profile without queuing. +// This implements the ProfileCreator interface. func (sc *Storage) CreateContainerProfileDirect(profile *v1beta1.ContainerProfile) error { // Apply name modifications if needed (keeping your existing logic) // sc.modifyNameP(&profile.Name) diff --git a/pkg/storage/v1/networkneighborhood.go b/pkg/storage/v1/networkneighborhood.go index bfe52b2e3d..cec12b97e4 100644 --- a/pkg/storage/v1/networkneighborhood.go +++ b/pkg/storage/v1/networkneighborhood.go @@ -7,12 +7,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func (sc *Storage) GetNetworkNeighborhood(namespace, name string) (*v1beta1.NetworkNeighborhood, error) { - return sc.storageClient.NetworkNeighborhoods(namespace).Get(context.Background(), name, metav1.GetOptions{}) +func (sc *Storage) GetNetworkNeighborhood(ctx context.Context, namespace, name string) (*v1beta1.NetworkNeighborhood, error) { + return sc.storageClient.NetworkNeighborhoods(namespace).Get(ctx, name, metav1.GetOptions{}) } -func (sc *Storage) ListNetworkNeighborhoods(namespace string, limit int64, cont string) (*v1beta1.NetworkNeighborhoodList, error) { - return sc.storageClient.NetworkNeighborhoods(namespace).List(context.Background(), metav1.ListOptions{ +func (sc *Storage) ListNetworkNeighborhoods(ctx context.Context, namespace string, limit int64, cont string) (*v1beta1.NetworkNeighborhoodList, error) { + return sc.storageClient.NetworkNeighborhoods(namespace).List(ctx, metav1.ListOptions{ Limit: limit, Continue: cont, })