From 32e1ec00eb20d009ccc75d7d138e0cba34d1c88b Mon Sep 17 00:00:00 2001 From: entlein Date: Thu, 23 Apr 2026 10:16:10 +0200 Subject: [PATCH 01/16] storage PR 1 - test if it has clean git history Signed-off-by: entlein --- .../file/dynamicpathdetector/analyzer.go | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index a7a01dadc..f30a0b14d 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -2,9 +2,23 @@ package dynamicpathdetector import ( "path" - "strings" + "sync" ) +// bufPool reuses byte-slice capacity across AnalyzePath calls. strings.Builder +// was tempting but its Reset() discards the buffer, defeating the pool; a raw +// []byte with len=0/cap-preserved survives reuse. Steady-state per-call cost +// is one string allocation (the final string conversion) and nothing else. +// Thread-safe by virtue of sync.Pool. +const defaultBuildBufCap = 128 + +var bufPool = sync.Pool{ + New: func() any { + b := make([]byte, 0, defaultBuildBufCap) + return &b + }, +} + func NewPathAnalyzer(threshold int) *PathAnalyzer { return &PathAnalyzer{ RootNodes: make(map[string]*SegmentNode), @@ -27,7 +41,14 @@ func (ua *PathAnalyzer) AnalyzePath(p, identifier string) (string, error) { } func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { - var result strings.Builder + // Acquire a pooled byte-slice. len=0, cap preserved from previous reuse. + bufPtr := bufPool.Get().(*[]byte) + buf := (*bufPtr)[:0] + if cap(buf) < len(p) { + // Pooled capacity is too small for this input; grow once. + buf = make([]byte, 0, len(p)+16) + } + currentNode := node i := 0 for { @@ -38,14 +59,20 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { segment := p[start:i] currentNode = ua.processSegment(currentNode, segment) ua.updateNodeStats(currentNode) - result.WriteString(currentNode.SegmentName) + buf = append(buf, currentNode.SegmentName...) i++ if len(p) < i { break } - result.WriteByte('/') + buf = append(buf, '/') } - return result.String() + + // string(buf) always copies, so it is safe to return the pool capacity + // immediately afterwards — the returned string does not alias buf. + out := string(buf) + *bufPtr = buf + bufPool.Put(bufPtr) + return out } func (ua *PathAnalyzer) processSegment(node *SegmentNode, segment string) *SegmentNode { From 17ff5114d041f94f7d36b0bcb6281ea99ebcc6d0 Mon Sep 17 00:00:00 2001 From: entlein Date: Thu, 23 Apr 2026 10:49:27 +0200 Subject: [PATCH 02/16] storage PR 1 - test the overhead introduced by the changes Signed-off-by: entlein --- .../file/dynamicpathdetector/analyzer.go | 67 ++++++- .../dynamicpathdetector/tests/profile_test.go | 167 ++++++++++++++++++ .../file/dynamicpathdetector/types.go | 53 +++++- 3 files changed, 279 insertions(+), 8 deletions(-) create mode 100644 pkg/registry/file/dynamicpathdetector/tests/profile_test.go diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index f30a0b14d..d90d4d93a 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -19,11 +19,61 @@ var bufPool = sync.Pool{ }, } +// NewPathAnalyzer builds an analyzer with a single global collapse threshold +// and no per-prefix overrides — equivalent behaviour to the pre-CollapseConfig +// world. Retained so existing callers don't need to change. func NewPathAnalyzer(threshold int) *PathAnalyzer { + return NewPathAnalyzerWithConfigs(threshold, nil) +} + +// NewPathAnalyzerWithConfigs builds an analyzer whose collapse threshold can +// vary per path prefix. defaultThreshold applies when no CollapseConfig in +// configs matches; configs are checked longest-prefix-wins at walk time. +// +// configs is copied so the caller can reuse or mutate the slice without +// affecting the analyzer. +func NewPathAnalyzerWithConfigs(defaultThreshold int, configs []CollapseConfig) *PathAnalyzer { + copied := make([]CollapseConfig, len(configs)) + copy(copied, configs) return &PathAnalyzer{ - RootNodes: make(map[string]*SegmentNode), - threshold: threshold, + RootNodes: make(map[string]*SegmentNode), + threshold: defaultThreshold, + configs: copied, + defaultCfg: CollapseConfig{Prefix: "/", Threshold: defaultThreshold}, + } +} + +// effectiveThreshold returns the collapse threshold applicable to the given +// path prefix, picking the longest matching CollapseConfig or falling back +// to the analyzer's default. Loop is O(len(configs)) and configs is small +// (five entries in practice); no allocations. +func (ua *PathAnalyzer) effectiveThreshold(pathPrefix string) int { + bestLen := 0 + best := ua.threshold + for i := range ua.configs { + c := &ua.configs[i] + if len(c.Prefix) >= bestLen && hasPrefixAtBoundary(pathPrefix, c.Prefix) { + bestLen = len(c.Prefix) + best = c.Threshold + } + } + return best +} + +// hasPrefixAtBoundary is like strings.HasPrefix but only matches if the +// prefix ends at a path boundary (either pathPrefix == prefix, or the next +// rune in pathPrefix is '/'). Prevents "/etc" matching "/etcd". +func hasPrefixAtBoundary(pathPrefix, prefix string) bool { + if len(pathPrefix) < len(prefix) { + return false + } + if pathPrefix[:len(prefix)] != prefix { + return false + } + if len(pathPrefix) == len(prefix) { + return true } + return pathPrefix[len(prefix)] == '/' } func (ua *PathAnalyzer) AnalyzePath(p, identifier string) (string, error) { @@ -58,7 +108,10 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { } segment := p[start:i] currentNode = ua.processSegment(currentNode, segment) - ua.updateNodeStats(currentNode) + // Look up the effective collapse threshold at the prefix we've + // just walked to (p[:i]). Allocation-free — the slice aliases + // the caller's original path string. + ua.updateNodeStats(currentNode, ua.effectiveThreshold(p[:i])) buf = append(buf, currentNode.SegmentName...) i++ if len(p) < i { @@ -131,8 +184,12 @@ func (ua *PathAnalyzer) createDynamicNode(node *SegmentNode) *SegmentNode { return dynamicNode } -func (ua *PathAnalyzer) updateNodeStats(node *SegmentNode) { - if node.Count > ua.threshold && !node.IsNextDynamic() { +// updateNodeStats collapses node's children into a single ⋯ (DynamicIdentifier) +// child once the number of distinct children exceeds the provided threshold. +// Threshold is passed in by the caller so per-prefix overrides (via +// CollapseConfig) can take effect without this function knowing about them. +func (ua *PathAnalyzer) updateNodeStats(node *SegmentNode, threshold int) { + if node.Count > threshold && !node.IsNextDynamic() { dynamicChild := &SegmentNode{ SegmentName: DynamicIdentifier, Count: 0, diff --git a/pkg/registry/file/dynamicpathdetector/tests/profile_test.go b/pkg/registry/file/dynamicpathdetector/tests/profile_test.go new file mode 100644 index 000000000..7921abc17 --- /dev/null +++ b/pkg/registry/file/dynamicpathdetector/tests/profile_test.go @@ -0,0 +1,167 @@ +package dynamicpathdetectortests + +// Profiling helpers for the path analyzer hot path. +// +// Invocation: +// +// go test ./pkg/registry/file/dynamicpathdetector/tests/ \ +// -run=TestProfileAnalyzePath \ +// -profile-out=/tmp/analyzer-profile \ +// -profile-iters=200000 +// +// Writes cpu.out, mem.out (heap alloc_space sampled) and goroutine.out +// to the directory, then prints the top allocators to stdout so the +// test log alone is enough to see regressions. Skipped unless +// -profile-out is set, so it stays cheap on a regular `go test ./...`. +// +// Helper benchmark BenchmarkAnalyzePathWarm exercises the steady state +// where the trie is already populated — the interesting mode for +// zero-alloc analysis, because first-insert naturally allocates. + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "runtime" + "runtime/pprof" + "testing" + + "github.com/kubescape/storage/pkg/registry/file/dynamicpathdetector" +) + +var profileOutDir = flag.String("profile-out", "", + "directory to write analyzer profiles into (e.g. /tmp/analyzer-profile); if empty the profile test is skipped") +var profileIters = flag.Int("profile-iters", 100000, + "number of AnalyzePath calls to execute during the profile run") + +// TestProfileAnalyzePath runs a large in-process workload against AnalyzePath +// and writes CPU + heap profiles. Intended for interactive iteration during +// the zero-alloc rewrite, not for CI (requires -profile-out to run). +func TestProfileAnalyzePath(t *testing.T) { + if *profileOutDir == "" { + t.Skip("set -profile-out= to enable the profile test") + } + if err := os.MkdirAll(*profileOutDir, 0o755); err != nil { + t.Fatalf("mkdir profile dir: %v", err) + } + + // Generate a representative mixed workload once, outside the measured + // section, so its allocations don't show up in the profile. + paths := generateMixedPaths(10000, 0) + analyzer := dynamicpathdetector.NewPathAnalyzer(100) + identifier := "profile" + + // Warm up the analyzer so the trie is populated. Steady-state calls + // are the interesting regime for zero-alloc (cold inserts always + // allocate a new node). + for i := 0; i < len(paths); i++ { + if _, err := analyzer.AnalyzePath(paths[i], identifier); err != nil { + t.Fatalf("warmup AnalyzePath: %v", err) + } + } + + // CPU profile. + cpuPath := filepath.Join(*profileOutDir, "cpu.out") + cpuFile, err := os.Create(cpuPath) + if err != nil { + t.Fatalf("create cpu profile: %v", err) + } + if err := pprof.StartCPUProfile(cpuFile); err != nil { + t.Fatalf("start cpu profile: %v", err) + } + + // Force a clean GC baseline so MemStats numbers reflect only the + // measured section. + runtime.GC() + var before, after runtime.MemStats + runtime.ReadMemStats(&before) + + for i := 0; i < *profileIters; i++ { + if _, err := analyzer.AnalyzePath(paths[i%len(paths)], identifier); err != nil { + pprof.StopCPUProfile() + cpuFile.Close() + t.Fatalf("AnalyzePath iter %d: %v", i, err) + } + } + + pprof.StopCPUProfile() + cpuFile.Close() + + runtime.ReadMemStats(&after) + + // Heap profile (alloc_space). + memPath := filepath.Join(*profileOutDir, "mem.out") + memFile, err := os.Create(memPath) + if err != nil { + t.Fatalf("create mem profile: %v", err) + } + if err := pprof.Lookup("allocs").WriteTo(memFile, 0); err != nil { + t.Fatalf("write mem profile: %v", err) + } + memFile.Close() + + // Goroutine snapshot (useful when debugging leaks). + goPath := filepath.Join(*profileOutDir, "goroutine.out") + goFile, err := os.Create(goPath) + if err != nil { + t.Fatalf("create goroutine profile: %v", err) + } + if err := pprof.Lookup("goroutine").WriteTo(goFile, 0); err != nil { + t.Fatalf("write goroutine profile: %v", err) + } + goFile.Close() + + totalBytes := after.TotalAlloc - before.TotalAlloc + totalMallocs := after.Mallocs - before.Mallocs + t.Logf("AnalyzePath: %d iterations", *profileIters) + t.Logf(" bytes allocated: %d total, %.2f B/call", totalBytes, float64(totalBytes)/float64(*profileIters)) + t.Logf(" heap objects : %d total, %.2f allocs/call", totalMallocs, float64(totalMallocs)/float64(*profileIters)) + t.Logf(" wrote profiles to %s", *profileOutDir) + t.Logf(" inspect with: go tool pprof -top -alloc_space %s", memPath) + t.Logf(" go tool pprof -top %s", cpuPath) +} + +// BenchmarkAnalyzePathWarm is a companion to BenchmarkAnalyzePath that +// pre-populates the analyzer's trie, so every iteration exercises the +// steady-state walk instead of first-insert. Steady-state is the regime +// we care about for zero-alloc — cold inserts will always allocate a +// new SegmentNode. +func BenchmarkAnalyzePathWarm(b *testing.B) { + paths := generateMixedPaths(10000, 0) + analyzer := dynamicpathdetector.NewPathAnalyzer(100) + identifier := "warm" + for _, p := range paths { + if _, err := analyzer.AnalyzePath(p, identifier); err != nil { + b.Fatalf("warmup: %v", err) + } + } + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := analyzer.AnalyzePath(paths[i%len(paths)], identifier); err != nil { + b.Fatalf("AnalyzePath: %v", err) + } + } +} + +// BenchmarkAnalyzePathCold is the counterpart: each iteration uses a +// fresh analyzer, so it measures the cost including node allocation. +// The two benchmarks together bracket the true cost envelope. +func BenchmarkAnalyzePathCold(b *testing.B) { + paths := generateMixedPaths(10000, 0) + identifier := "cold" + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + analyzer := dynamicpathdetector.NewPathAnalyzer(100) + if _, err := analyzer.AnalyzePath(paths[i%len(paths)], identifier); err != nil { + b.Fatalf("AnalyzePath: %v", err) + } + } +} + +// Ensure fmt is kept imported when future debugging prints land here. +var _ = fmt.Sprint diff --git a/pkg/registry/file/dynamicpathdetector/types.go b/pkg/registry/file/dynamicpathdetector/types.go index 1bd2d52ff..df25a86d3 100644 --- a/pkg/registry/file/dynamicpathdetector/types.go +++ b/pkg/registry/file/dynamicpathdetector/types.go @@ -1,6 +1,51 @@ package dynamicpathdetector -const DynamicIdentifier string = "\u22ef" +// --- Identifier constants --- +// DynamicIdentifier matches exactly one path segment (single-segment wildcard). +// WildcardIdentifier matches zero-or-more path segments (glob-style **). +const ( + DynamicIdentifier string = "⋯" // U+22EF: ⋯ + WildcardIdentifier string = "*" +) + +// --- Default collapse thresholds --- +// OpenDynamicThreshold is the fallback threshold used by AnalyzeOpens when +// no more-specific CollapseConfig matches the walked path prefix. +// EndpointDynamicThreshold is the counterpart for AnalyzeEndpoints. +const ( + OpenDynamicThreshold = 50 + EndpointDynamicThreshold = 100 +) + +// --- Collapse configuration --- +// CollapseConfig controls the threshold at which children of a trie node +// (under the given path Prefix) are collapsed into a dynamic node (⋯). +// Longest-prefix wins at analysis time. +type CollapseConfig struct { + Prefix string + Threshold int +} + +// DefaultCollapseConfigs carries the per-prefix thresholds we've found +// useful in practice. These are the defaults wired into AnalyzeOpens; a +// caller can pass a different slice via NewPathAnalyzerWithConfigs if +// they want to tune for their workload. +var DefaultCollapseConfigs = []CollapseConfig{ + {Prefix: "/etc", Threshold: 100}, + {Prefix: "/etc/apache2", Threshold: 5}, // webapp test workload has a tight apache tree + {Prefix: "/opt", Threshold: 5}, + {Prefix: "/var/run", Threshold: 3}, + {Prefix: "/app", Threshold: 1}, +} + +// DefaultCollapseConfig is the global fallback used when no CollapseConfig +// prefix matches the walked path. +var DefaultCollapseConfig = CollapseConfig{ + Prefix: "/", + Threshold: OpenDynamicThreshold, +} + +// --- Trie types --- type SegmentNode struct { SegmentName string @@ -9,8 +54,10 @@ type SegmentNode struct { } type PathAnalyzer struct { - RootNodes map[string]*SegmentNode - threshold int + RootNodes map[string]*SegmentNode + threshold int // fallback threshold when no config matches + configs []CollapseConfig // per-prefix overrides; longest prefix wins + defaultCfg CollapseConfig // explicit fallback; equivalent to {Prefix:"/", Threshold: threshold} } func (sn *SegmentNode) IsNextDynamic() bool { From 470582053c26047cadb2630e1d1fd8168b40e2ec Mon Sep 17 00:00:00 2001 From: entlein Date: Thu, 23 Apr 2026 11:03:09 +0200 Subject: [PATCH 03/16] storage PR 1 - this is the greedy collapse when there is already a WildcardSegment, I want to test this one e2e against actual component test before merge Signed-off-by: entlein --- .../file/dynamicpathdetector/analyzer.go | 55 +++++++++++++++---- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index d90d4d93a..52ea9e281 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -107,11 +107,10 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { i++ } segment := p[start:i] - currentNode = ua.processSegment(currentNode, segment) - // Look up the effective collapse threshold at the prefix we've - // just walked to (p[:i]). Allocation-free — the slice aliases - // the caller's original path string. - ua.updateNodeStats(currentNode, ua.effectiveThreshold(p[:i])) + // Effective threshold at this depth (allocation-free slice). + threshold := ua.effectiveThreshold(p[:i]) + currentNode = ua.processSegment(currentNode, segment, threshold) + ua.updateNodeStats(currentNode, threshold) buf = append(buf, currentNode.SegmentName...) i++ if len(p) < i { @@ -128,21 +127,36 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { return out } -func (ua *PathAnalyzer) processSegment(node *SegmentNode, segment string) *SegmentNode { +func (ua *PathAnalyzer) processSegment(node *SegmentNode, segment string, threshold int) *SegmentNode { if segment == DynamicIdentifier { return ua.handleDynamicSegment(node) - } else if node.IsNextDynamic() { + } + // Wildcard short-circuit: once a node has a * child, all paths through + // it go there. This is the glob-style "collapse everything below here" + // behaviour; set up either by threshold=1 (see below) or by a caller + // explicitly feeding a WildcardIdentifier segment. + if wildcardChild, exists := node.Children[WildcardIdentifier]; exists { + return wildcardChild + } + if node.IsNextDynamic() { if len(node.Children) > 1 { temp := node.Children[DynamicIdentifier] node.Children = map[string]*SegmentNode{} node.Children[DynamicIdentifier] = temp } return node.Children[DynamicIdentifier] - } else if child, exists := node.Children[segment]; exists { + } + if child, exists := node.Children[segment]; exists { return child - } else { - return ua.handleNewSegment(node, segment) } + // Threshold-1 short-circuit: a prefix explicitly configured to accept + // one unique child (CollapseConfig Threshold == 1) collapses to * on + // the first *new* segment rather than going through the ⋯ path. This + // matches the caller's intent of "anything under /app is noise". + if threshold == 1 { + return ua.createWildcardNode(node) + } + return ua.handleNewSegment(node, segment) } func (ua *PathAnalyzer) handleNewSegment(node *SegmentNode, segment string) *SegmentNode { @@ -164,6 +178,27 @@ func (ua *PathAnalyzer) handleDynamicSegment(node *SegmentNode) *SegmentNode { } } +// createWildcardNode replaces all of node's existing children with a single +// WildcardIdentifier (*) child, absorbing the existing subtree counts into it. +// Used for the threshold-1 short-circuit: once a prefix is configured to keep +// at most one unique child, any second unique value collapses the whole +// subtree to *. +func (ua *PathAnalyzer) createWildcardNode(node *SegmentNode) *SegmentNode { + wildcard := &SegmentNode{ + SegmentName: WildcardIdentifier, + Count: 0, + Children: make(map[string]*SegmentNode), + } + // Absorb any previously-accumulated children. Mirrors createDynamicNode. + for _, child := range node.Children { + shallowChildrenCopy(child, wildcard) + } + node.Children = map[string]*SegmentNode{ + WildcardIdentifier: wildcard, + } + return wildcard +} + func (ua *PathAnalyzer) createDynamicNode(node *SegmentNode) *SegmentNode { dynamicNode := &SegmentNode{ SegmentName: DynamicIdentifier, From d69f35b4afa0493a59840067cc20a6d3162241eb Mon Sep 17 00:00:00 2001 From: entlein Date: Thu, 23 Apr 2026 11:09:45 +0200 Subject: [PATCH 04/16] storage PR 1 - adjacent collapse of two ellipsis chars into one Wildcard, this requires functional requirements as it could defeat the intention of a user, if they really want just two subpath elements, to be discussed Signed-off-by: entlein --- .../file/dynamicpathdetector/analyzer.go | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index 52ea9e281..ae69c2c62 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -119,6 +119,12 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { buf = append(buf, '/') } + // Post-process: collapse runs of adjacent DynamicIdentifier segments + // (e.g. "/a/⋯/⋯/b") into a single WildcardIdentifier ("/a/*/b"). Done + // in place by shrinking buf — zero allocation because the output is + // always shorter than the input. + buf = collapseAdjacentDynamic(buf) + // string(buf) always copies, so it is safe to return the pool capacity // immediately afterwards — the returned string does not alias buf. out := string(buf) @@ -127,6 +133,38 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { return out } +// collapseAdjacentDynamic compacts buf in place: any run of +// "⋯/⋯[/⋯…]" becomes a single "*". Returns a buf[:n] slice where n is +// the compacted length. Does not allocate; suitable for the hot path. +func collapseAdjacentDynamic(buf []byte) []byte { + // DynamicIdentifier is U+22EF, three UTF-8 bytes: 0xE2 0x8B 0xAF. + const d0, d1, d2 = 0xE2, 0x8B, 0xAF + const dynLen = 3 + isDyn := func(i int) bool { + return i+dynLen <= len(buf) && buf[i] == d0 && buf[i+1] == d1 && buf[i+2] == d2 + } + + out := 0 + i := 0 + for i < len(buf) { + // Need at least "⋯/⋯" (7 bytes) to trigger a collapse. + if isDyn(i) && i+dynLen+1+dynLen <= len(buf) && buf[i+dynLen] == '/' && isDyn(i+dynLen+1) { + buf[out] = '*' + out++ + // Consume "⋯/⋯" plus any further "/⋯" in the run. + i += dynLen + 1 + dynLen + for i+1+dynLen <= len(buf) && buf[i] == '/' && isDyn(i+1) { + i += 1 + dynLen + } + continue + } + buf[out] = buf[i] + out++ + i++ + } + return buf[:out] +} + func (ua *PathAnalyzer) processSegment(node *SegmentNode, segment string, threshold int) *SegmentNode { if segment == DynamicIdentifier { return ua.handleDynamicSegment(node) From f9c5d6bd2099ac3d547ec333463c26d575eb0283 Mon Sep 17 00:00:00 2001 From: entlein Date: Thu, 23 Apr 2026 11:15:40 +0200 Subject: [PATCH 05/16] storage PR 1 - resetting all DefaultCollapsConfigs to 50 only leaving /etc higher to test the functionality Signed-off-by: entlein --- pkg/registry/file/dynamicpathdetector/types.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/types.go b/pkg/registry/file/dynamicpathdetector/types.go index df25a86d3..02afc0c03 100644 --- a/pkg/registry/file/dynamicpathdetector/types.go +++ b/pkg/registry/file/dynamicpathdetector/types.go @@ -32,10 +32,10 @@ type CollapseConfig struct { // they want to tune for their workload. var DefaultCollapseConfigs = []CollapseConfig{ {Prefix: "/etc", Threshold: 100}, - {Prefix: "/etc/apache2", Threshold: 5}, // webapp test workload has a tight apache tree - {Prefix: "/opt", Threshold: 5}, - {Prefix: "/var/run", Threshold: 3}, - {Prefix: "/app", Threshold: 1}, + {Prefix: "/etc/apache2", Threshold: 50}, + {Prefix: "/opt", Threshold: 50}, + {Prefix: "/var/run", Threshold: 50}, + {Prefix: "/app", Threshold: 50}, } // DefaultCollapseConfig is the global fallback used when no CollapseConfig From d86673dda72a4adafaad0e95dbca513f8e1f763e Mon Sep 17 00:00:00 2001 From: entlein Date: Fri, 24 Apr 2026 16:33:16 +0200 Subject: [PATCH 06/16] syft downgrade, here we go; Signed-off-by: entlein --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index d8238bdec..bfa7eab96 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ go 1.25.8 require ( github.com/SergJa/jsonhash v0.0.0-20210531165746-fc45f346aa74 - github.com/anchore/syft v1.42.3 + github.com/anchore/syft v1.32.0 github.com/armosec/armoapi-go v0.0.696 github.com/armosec/utils-k8s-go v0.0.30 github.com/containers/common v0.63.0 diff --git a/go.sum b/go.sum index 4df8ea5bf..f3b7ba29f 100644 --- a/go.sum +++ b/go.sum @@ -88,8 +88,8 @@ github.com/anchore/packageurl-go v0.1.1-0.20250220190351-d62adb6e1115 h1:ZyRCmiE github.com/anchore/packageurl-go v0.1.1-0.20250220190351-d62adb6e1115/go.mod h1:KoYIv7tdP5+CC9VGkeZV4/vGCKsY55VvoG+5dadg4YI= github.com/anchore/stereoscope v0.1.22 h1:L807G/kk0WZzOCGuRGF7knxMKzwW2PGdbPVRystryd8= github.com/anchore/stereoscope v0.1.22/go.mod h1:FikPtAb/WnbqwgLHAvQA9O+fWez0K4pbjxzghz++iy4= -github.com/anchore/syft v1.42.3 h1:eIeeGyqfXm/C8wpBWU50xFbOjdL37VbLatMj9nEJ6n4= -github.com/anchore/syft v1.42.3/go.mod h1:i2PZ+276IdPcnd/n32aeIv849iO/QqdjRknbIc39yL0= +github.com/anchore/syft v1.32.0 h1:JcX9W+P/Xjv5DNg3TNBtwiEyZommuTaP16/NC9r0Yfo= +github.com/anchore/syft v1.32.0/go.mod h1:E6Kd4iBM2ljUOUQvSt7hVK6vBwaHkMXwcvBZmGMSY5o= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= From 6bef023942450f8cec2f95e615d632e014558962 Mon Sep 17 00:00:00 2001 From: entlein Date: Fri, 24 Apr 2026 17:28:08 +0200 Subject: [PATCH 07/16] stereoscope diff Signed-off-by: entlein --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index bfa7eab96..9587231d9 100644 --- a/go.mod +++ b/go.mod @@ -50,7 +50,7 @@ require ( github.com/acobaugh/osrelease v0.1.0 // indirect github.com/anchore/go-logger v0.0.0-20250318195838-07ae343dd722 // indirect github.com/anchore/packageurl-go v0.1.1-0.20250220190351-d62adb6e1115 // indirect - github.com/anchore/stereoscope v0.1.22 // indirect + github.com/anchore/stereoscope v0.1.9 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/armosec/gojay v1.2.17 // indirect github.com/armosec/utils-go v0.0.58 // indirect diff --git a/go.sum b/go.sum index f3b7ba29f..41232a208 100644 --- a/go.sum +++ b/go.sum @@ -86,8 +86,8 @@ github.com/anchore/go-logger v0.0.0-20250318195838-07ae343dd722 h1:2SqmFgE7h+Ql4 github.com/anchore/go-logger v0.0.0-20250318195838-07ae343dd722/go.mod h1:oFuE8YuTCM+spgMXhePGzk3asS94yO9biUfDzVTFqNw= github.com/anchore/packageurl-go v0.1.1-0.20250220190351-d62adb6e1115 h1:ZyRCmiEjnoGJZ1+Ah0ZZ/mKKqNhGcUZBl0s7PTTDzvY= github.com/anchore/packageurl-go v0.1.1-0.20250220190351-d62adb6e1115/go.mod h1:KoYIv7tdP5+CC9VGkeZV4/vGCKsY55VvoG+5dadg4YI= -github.com/anchore/stereoscope v0.1.22 h1:L807G/kk0WZzOCGuRGF7knxMKzwW2PGdbPVRystryd8= -github.com/anchore/stereoscope v0.1.22/go.mod h1:FikPtAb/WnbqwgLHAvQA9O+fWez0K4pbjxzghz++iy4= +github.com/anchore/stereoscope v0.1.9 h1:Nhvk8g6PRx9ubaJU4asAhD3fGcY5HKXZCDGkxI2e0sI= +github.com/anchore/stereoscope v0.1.9/go.mod h1:YkrCtDgz7A+w6Ggd0yxU9q58CerqQFwYARS+F2RvLQQ= github.com/anchore/syft v1.32.0 h1:JcX9W+P/Xjv5DNg3TNBtwiEyZommuTaP16/NC9r0Yfo= github.com/anchore/syft v1.32.0/go.mod h1:E6Kd4iBM2ljUOUQvSt7hVK6vBwaHkMXwcvBZmGMSY5o= github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8= From 6cb0a8e59f046a94b97afee8fb6c29f2674482aa Mon Sep 17 00:00:00 2001 From: entlein Date: Fri, 24 Apr 2026 17:44:58 +0200 Subject: [PATCH 08/16] downgrade runtime-spec Signed-off-by: entlein --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 9587231d9..5149e254f 100644 --- a/go.mod +++ b/go.mod @@ -132,7 +132,7 @@ require ( github.com/oklog/ulid v1.3.1 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect - github.com/opencontainers/runtime-spec v1.3.0 // indirect + github.com/opencontainers/runtime-spec v1.2.1 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/petermattis/goid v0.0.0-20241211131331-93ee7e083c43 // indirect github.com/pierrec/lz4/v4 v4.1.22 // indirect diff --git a/go.sum b/go.sum index 41232a208..13713f101 100644 --- a/go.sum +++ b/go.sum @@ -564,8 +564,8 @@ github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8 github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= -github.com/opencontainers/runtime-spec v1.3.0 h1:YZupQUdctfhpZy3TM39nN9Ika5CBWT5diQ8ibYCRkxg= -github.com/opencontainers/runtime-spec v1.3.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.2.1 h1:S4k4ryNgEpxW1dzyqffOmhI1BHYcjzU8lpJfSlR0xww= +github.com/opencontainers/runtime-spec v1.2.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-tools v0.9.1-0.20250303011046-260e151b8552 h1:CkXngT0nixZqQUPDVfwVs3GiuhfTqCMk0V+OoHpxIvA= github.com/opencontainers/runtime-tools v0.9.1-0.20250303011046-260e151b8552/go.mod h1:T487Kf80NeF2i0OyVXHiylg217e0buz8pQsa0T791RA= github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8= From 4d15eb3293edea606b5af0c27d84c66f65068a28 Mon Sep 17 00:00:00 2001 From: entlein Date: Fri, 24 Apr 2026 22:03:38 +0200 Subject: [PATCH 09/16] WildCards for Opens and Endpoints incl UnitTests Signed-off-by: entlein --- .../file/applicationprofile_processor.go | 10 +- .../file/applicationprofile_processor_test.go | 201 ++++ pkg/registry/file/cleanup.go | 5 + .../file/containerprofile_processor.go | 4 +- .../dynamicpathdetector/analyze_endpoints.go | 66 +- .../file/dynamicpathdetector/analyzer.go | 129 ++- .../tests/analyze_endpoints_test.go | 139 ++- .../tests/analyze_opens_test.go | 911 ++++++++++++++++-- .../tests/benchmark_test.go | 22 +- .../tests/coverage_test.go | 280 ++++-- .../file/dynamicpathdetector/types.go | 4 +- 11 files changed, 1541 insertions(+), 230 deletions(-) diff --git a/pkg/registry/file/applicationprofile_processor.go b/pkg/registry/file/applicationprofile_processor.go index ad19b0817..e823fe4a9 100644 --- a/pkg/registry/file/applicationprofile_processor.go +++ b/pkg/registry/file/applicationprofile_processor.go @@ -17,10 +17,8 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) -const ( - OpenDynamicThreshold = 50 - EndpointDynamicThreshold = 100 -) +// Thresholds are defined in dynamicpathdetector.OpenDynamicThreshold and +// dynamicpathdetector.EndpointDynamicThreshold (single source of truth). type ApplicationProfileProcessor struct { defaultNamespace string @@ -109,12 +107,12 @@ func (a *ApplicationProfileProcessor) SetStorage(containerProfileStorage Contain } func deflateApplicationProfileContainer(container softwarecomposition.ApplicationProfileContainer, sbomSet mapset.Set[string]) softwarecomposition.ApplicationProfileContainer { - opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzer(OpenDynamicThreshold), sbomSet) + opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, dynamicpathdetector.DefaultCollapseConfigs), sbomSet) if err != nil { logger.L().Debug("falling back to DeflateStringer for opens", loggerhelpers.Error(err)) opens = DeflateStringer(container.Opens) } - endpoints := dynamicpathdetector.AnalyzeEndpoints(&container.Endpoints, dynamicpathdetector.NewPathAnalyzer(EndpointDynamicThreshold)) + endpoints := dynamicpathdetector.AnalyzeEndpoints(&container.Endpoints, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil)) identifiedCallStacks := callstack.UnifyIdentifiedCallStacks(container.IdentifiedCallStacks) return softwarecomposition.ApplicationProfileContainer{ diff --git a/pkg/registry/file/applicationprofile_processor_test.go b/pkg/registry/file/applicationprofile_processor_test.go index e727d20b6..e09ed0e15 100644 --- a/pkg/registry/file/applicationprofile_processor_test.go +++ b/pkg/registry/file/applicationprofile_processor_test.go @@ -4,17 +4,26 @@ import ( "context" "fmt" "slices" + "strings" "testing" + mapset "github.com/deckarep/golang-set/v2" "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" "github.com/kubescape/storage/pkg/apis/softwarecomposition" "github.com/kubescape/storage/pkg/apis/softwarecomposition/consts" "github.com/kubescape/storage/pkg/config" + "github.com/kubescape/storage/pkg/registry/file/dynamicpathdetector" "github.com/stretchr/testify/assert" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) +// openThreshold returns the collapse threshold used by deflateApplicationProfileContainer +// for file-open paths. NewPathAnalyzerWithConfigs uses OpenDynamicThreshold as the default. +func openThreshold() int { + return dynamicpathdetector.OpenDynamicThreshold +} + var ap = softwarecomposition.ApplicationProfile{ ObjectMeta: v1.ObjectMeta{ Annotations: map[string]string{}, @@ -247,3 +256,195 @@ func TestDeflateRulePolicies(t *testing.T) { }) } } + +// generateSOOpens creates N unique .so OpenCalls under /usr/lib/x86_64-linux-gnu/ +func generateSOOpens(n int) []softwarecomposition.OpenCalls { + opens := make([]softwarecomposition.OpenCalls, n) + for i := 0; i < n; i++ { + opens[i] = softwarecomposition.OpenCalls{ + Path: fmt.Sprintf("/usr/lib/x86_64-linux-gnu/lib%d.so.%d", i, i%5), + Flags: []string{"O_RDONLY", "O_CLOEXEC"}, + } + } + return opens +} + +func TestDeflateApplicationProfileContainer_CollapsesManyOpens(t *testing.T) { + // Generate enough opens to exceed the default threshold used by NewPathAnalyzerWithConfigs + numOpens := openThreshold() + 1 + opens := generateSOOpens(numOpens) + + container := softwarecomposition.ApplicationProfileContainer{ + Name: "test-container", + Opens: opens, + } + + result := deflateApplicationProfileContainer(container, nil) + + assert.Less(t, len(result.Opens), numOpens, + "%d .so files should be collapsed, got %d opens", numOpens, len(result.Opens)) + + // Verify collapsed paths contain dynamic or wildcard segments + for _, open := range result.Opens { + if strings.HasPrefix(open.Path, "/usr/lib/x86_64-linux-gnu/") { + assert.True(t, + strings.Contains(open.Path, "\u22ef") || strings.Contains(open.Path, "*"), + "path %q should contain a dynamic or wildcard segment", open.Path) + } + } + + // Flags should be preserved and merged + for _, open := range result.Opens { + assert.NotEmpty(t, open.Flags, "flags should be preserved after collapse") + } +} + +func TestDeflateApplicationProfileContainer_SbomPathsPreserved(t *testing.T) { + numOpens := openThreshold() + 1 + opens := generateSOOpens(numOpens) + + // Build sbomSet containing ALL the .so paths (realistic scenario: + // these are library files referenced by the SBOM for vulnerability scanning) + sbomSet := mapset.NewSet[string]() + for _, open := range opens { + sbomSet.Add(open.Path) + } + + container := softwarecomposition.ApplicationProfileContainer{ + Name: "test-container", + Opens: opens, + } + + result := deflateApplicationProfileContainer(container, sbomSet) + + // SBOM paths must NEVER be collapsed — they map to specific library files + // used for vulnerability scanning. Collapsing them makes vuln results + // non-reproducible. + assert.Equal(t, numOpens, len(result.Opens), + "SBOM paths must be preserved verbatim, got %d opens (expected %d)", len(result.Opens), numOpens) + resultPaths := make(map[string]bool) + for _, r := range result.Opens { + resultPaths[r.Path] = true + } + for _, open := range opens { + assert.True(t, resultPaths[open.Path], + "SBOM path %q must be preserved in output", open.Path) + } +} + +func TestDeflateApplicationProfileContainer_MixedPathsCollapse(t *testing.T) { + var opens []softwarecomposition.OpenCalls + + // /usr/lib uses the default threshold from NewPathAnalyzerWithConfigs(OpenDynamicThreshold, ...) + usrLibThreshold := openThreshold() + for i := 0; i < usrLibThreshold+1; i++ { + opens = append(opens, softwarecomposition.OpenCalls{ + Path: fmt.Sprintf("/usr/lib/lib%d.so", i), + Flags: []string{"O_RDONLY"}, + }) + } + + // /etc uses the /etc config threshold from DefaultCollapseConfigs (100) + etcThreshold := 100 + for i := 0; i < etcThreshold+1; i++ { + opens = append(opens, softwarecomposition.OpenCalls{ + Path: fmt.Sprintf("/etc/conf%d.cfg", i), + Flags: []string{"O_RDONLY"}, + }) + } + + opens = append(opens, + softwarecomposition.OpenCalls{Path: "/tmp/file1.txt", Flags: []string{"O_RDWR"}}, + softwarecomposition.OpenCalls{Path: "/tmp/file2.txt", Flags: []string{"O_RDWR"}}, + ) + + container := softwarecomposition.ApplicationProfileContainer{ + Name: "test-container", + Opens: opens, + } + + result := deflateApplicationProfileContainer(container, nil) + + // Count paths by prefix + var usrLibPaths, etcPaths, tmpPaths int + for _, open := range result.Opens { + switch { + case strings.HasPrefix(open.Path, "/usr/lib/"): + usrLibPaths++ + case strings.HasPrefix(open.Path, "/etc/"): + etcPaths++ + case strings.HasPrefix(open.Path, "/tmp/"): + tmpPaths++ + } + } + + assert.LessOrEqual(t, usrLibPaths, 1, "/usr/lib/ paths should collapse to 1, got %d", usrLibPaths) + assert.LessOrEqual(t, etcPaths, 1, "/etc/ paths should collapse to 1, got %d", etcPaths) + assert.Equal(t, 2, tmpPaths, "/tmp/ paths should remain individual (below threshold)") +} + +// TestDeflateApplicationProfileContainer_NilSbomNoError verifies that nil sbomSet +// with a small number of opens (below threshold) works without error. +func TestDeflateApplicationProfileContainer_NilSbomNoError(t *testing.T) { + container := softwarecomposition.ApplicationProfileContainer{ + Name: "test-container", + Opens: []softwarecomposition.OpenCalls{ + {Path: "/etc/hosts", Flags: []string{"O_RDONLY"}}, + {Path: "/etc/resolv.conf", Flags: []string{"O_RDONLY"}}, + {Path: "/usr/lib/libc.so.6", Flags: []string{"O_RDONLY", "O_CLOEXEC"}}, + }, + } + + result := deflateApplicationProfileContainer(container, nil) + + // All 3 paths should remain (below any threshold) + assert.Equal(t, 3, len(result.Opens), "paths below threshold should not collapse") + // Paths should be sorted + for i := 1; i < len(result.Opens); i++ { + assert.True(t, result.Opens[i-1].Path <= result.Opens[i].Path, + "opens should be sorted, got %q before %q", result.Opens[i-1].Path, result.Opens[i].Path) + } +} + +// TestDeflateApplicationProfileContainer_PreSaveEndToEnd verifies the full +// PreSave flow with an ApplicationProfile containing many opens that should collapse. +func TestDeflateApplicationProfileContainer_PreSaveEndToEnd(t *testing.T) { + numOpens := openThreshold() + 1 + opens := generateSOOpens(numOpens) + + profile := &softwarecomposition.ApplicationProfile{ + ObjectMeta: v1.ObjectMeta{ + Annotations: map[string]string{}, + }, + Spec: softwarecomposition.ApplicationProfileSpec{ + Containers: []softwarecomposition.ApplicationProfileContainer{ + { + Name: "main", + Opens: opens, + }, + }, + }, + } + + processor := NewApplicationProfileProcessor(config.Config{ + DefaultNamespace: "kubescape", + MaxApplicationProfileSize: 100000, + }) + + err := processor.PreSave(context.TODO(), profile) + assert.NoError(t, err) + + resultOpens := profile.Spec.Containers[0].Opens + assert.Less(t, len(resultOpens), numOpens, + "PreSave should collapse %d .so files, got %d opens", numOpens, len(resultOpens)) + + // The collapsed path should contain dynamic or wildcard segments + hasCollapsed := false + for _, open := range resultOpens { + if strings.Contains(open.Path, "\u22ef") || strings.Contains(open.Path, "*") { + hasCollapsed = true + break + } + } + assert.True(t, hasCollapsed, "at least one path should contain a dynamic/wildcard segment after PreSave") +} diff --git a/pkg/registry/file/cleanup.go b/pkg/registry/file/cleanup.go index 4298e39d2..36e0bc327 100644 --- a/pkg/registry/file/cleanup.go +++ b/pkg/registry/file/cleanup.go @@ -185,6 +185,11 @@ func (h *ResourcesCleanupHandler) cleanupNamespace(ctx context.Context, ns strin return nil } + // Skip user-managed resources (e.g., user-defined profiles) + if metadata.Labels[helpersv1.ManagedByMetadataKey] == helpersv1.ManagedByUserValue { + return nil + } + // either run single handler, or perform OR operation on multiple handlers var toDelete bool if len(handlers) == 1 { diff --git a/pkg/registry/file/containerprofile_processor.go b/pkg/registry/file/containerprofile_processor.go index 22eed312d..b8df5897a 100644 --- a/pkg/registry/file/containerprofile_processor.go +++ b/pkg/registry/file/containerprofile_processor.go @@ -743,12 +743,12 @@ func (a *ContainerProfileProcessor) getAggregatedData(ctx context.Context, key s } func DeflateContainerProfileSpec(container softwarecomposition.ContainerProfileSpec, sbomSet mapset.Set[string]) softwarecomposition.ContainerProfileSpec { - opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzer(OpenDynamicThreshold), sbomSet) + opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, dynamicpathdetector.DefaultCollapseConfigs), sbomSet) if err != nil { logger.L().Debug("ContainerProfileProcessor.deflateContainerProfileSpec - falling back to DeflateStringer for opens", loggerhelpers.Error(err)) opens = DeflateStringer(container.Opens) } - endpoints := dynamicpathdetector.AnalyzeEndpoints(&container.Endpoints, dynamicpathdetector.NewPathAnalyzer(EndpointDynamicThreshold)) + endpoints := dynamicpathdetector.AnalyzeEndpoints(&container.Endpoints, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil)) identifiedCallStacks := callstack.UnifyIdentifiedCallStacks(container.IdentifiedCallStacks) return softwarecomposition.ContainerProfileSpec{ diff --git a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go index 46fbe11bd..d620e3083 100644 --- a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go +++ b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go @@ -10,23 +10,51 @@ import ( types "github.com/kubescape/storage/pkg/apis/softwarecomposition" ) +func isWildcardPort(port string) bool { + return port == "0" +} + +func rewritePort(endpoint, wildcardPort string) string { + if wildcardPort == "" { + return endpoint + } + port, pathPart := splitEndpointPortAndPath(endpoint) + if !isWildcardPort(port) { + return ":" + wildcardPort + pathPart + } + return endpoint +} + func AnalyzeEndpoints(endpoints *[]types.HTTPEndpoint, analyzer *PathAnalyzer) []types.HTTPEndpoint { if len(*endpoints) == 0 { return nil } - var newEndpoints []*types.HTTPEndpoint + // Detect wildcard port in input (port 0 means any port) + wildcardPort := "" + for _, ep := range *endpoints { + port, _ := splitEndpointPortAndPath(ep.Endpoint) + if isWildcardPort(port) { + wildcardPort = port + break + } + } + + // First pass: build tree, redirecting to wildcard port if needed for _, endpoint := range *endpoints { - _, _ = AnalyzeURL(endpoint.Endpoint, analyzer) + _, _ = AnalyzeURL(rewritePort(endpoint.Endpoint, wildcardPort), analyzer) } + // Second pass: process endpoints + var newEndpoints []*types.HTTPEndpoint for _, endpoint := range *endpoints { - processedEndpoint, err := ProcessEndpoint(&endpoint, analyzer, newEndpoints) + ep := endpoint + ep.Endpoint = rewritePort(ep.Endpoint, wildcardPort) + processedEndpoint, err := ProcessEndpoint(&ep, analyzer, newEndpoints) if processedEndpoint == nil && err == nil || err != nil { continue - } else { - newEndpoints = append(newEndpoints, processedEndpoint) } + newEndpoints = append(newEndpoints, processedEndpoint) } newEndpoints = MergeDuplicateEndpoints(newEndpoints) @@ -88,6 +116,15 @@ func AnalyzeURL(urlString string, analyzer *PathAnalyzer) (string, error) { return ":" + port + path, nil } +func splitEndpointPortAndPath(endpoint string) (string, string) { + s := strings.TrimPrefix(endpoint, ":") + idx := strings.Index(s, "/") + if idx == -1 { + return s, "/" + } + return s[:idx], s[idx:] +} + func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpoint { seen := make(map[string]*types.HTTPEndpoint) var newEndpoints []*types.HTTPEndpoint @@ -97,10 +134,22 @@ func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpo if existing, found := seen[key]; found { existing.Methods = MergeStrings(existing.Methods, endpoint.Methods) mergeHeaders(existing, endpoint) - } else { - seen[key] = endpoint - newEndpoints = append(newEndpoints, endpoint) + continue } + + // Check if a wildcard port variant already exists (port 0 means any port) + port, pathPart := splitEndpointPortAndPath(endpoint.Endpoint) + if !isWildcardPort(port) { + wildcardKey := fmt.Sprintf(":%s%s|%s", "0", pathPart, endpoint.Direction) + if existing, found := seen[wildcardKey]; found { + existing.Methods = MergeStrings(existing.Methods, endpoint.Methods) + mergeHeaders(existing, endpoint) + continue + } + } + + seen[key] = endpoint + newEndpoints = append(newEndpoints, endpoint) } return newEndpoints @@ -111,7 +160,6 @@ func getEndpointKey(endpoint *types.HTTPEndpoint) string { } func mergeHeaders(existing, new *types.HTTPEndpoint) { - // TODO: Find a better way to unmashal the headers existingHeaders, err := existing.GetHeaders() if err != nil { return diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index ae69c2c62..d83fe756a 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -2,6 +2,7 @@ package dynamicpathdetector import ( "path" + "strings" "sync" ) @@ -107,11 +108,36 @@ func (ua *PathAnalyzer) processSegments(node *SegmentNode, p string) string { i++ } segment := p[start:i] - // Effective threshold at this depth (allocation-free slice). - threshold := ua.effectiveThreshold(p[:i]) - currentNode = ua.processSegment(currentNode, segment, threshold) - ua.updateNodeStats(currentNode, threshold) + // Two thresholds at two scopes — necessary because processSegment + // and updateNodeStats ask different questions about different nodes: + // + // insertThreshold is for the PARENT node's config (path prefix up + // to, but not including, the current segment). It answers: "if + // we need to add `segment` under the parent, should we wildcard + // the parent's children instead (threshold == 1)?". Using p[:i] + // here would incorrectly apply the current segment's own config, + // causing a {Prefix: "/instant", Threshold: 1} rule to wildcard + // the "instant" segment itself and produce "/*/*/*" rather than + // "/instant/*". + // + // collapseThreshold is for the CURRENT node's config (path prefix + // INCLUDING the current segment, i.e. the node we just descended + // to). It answers: "do this node's direct children exceed the + // collapse threshold configured for this node's path?". Here we + // do want p[:i] — updateNodeStats then collapses the current + // node's children to ⋯ when Count > threshold. + insertThreshold := ua.effectiveThreshold(p[:start]) + collapseThreshold := ua.effectiveThreshold(p[:i]) + currentNode = ua.processSegment(currentNode, segment, insertThreshold) + ua.updateNodeStats(currentNode, collapseThreshold) buf = append(buf, currentNode.SegmentName...) + // Wildcard absorbs the rest of the path: once a segment has been + // emitted as `*`, walking deeper would just append more "/*" + // suffixes, producing "/a/*/*/*" where the correct output is + // "/a/*". Terminate emission here. + if currentNode.SegmentName == WildcardIdentifier { + break + } i++ if len(p) < i { break @@ -274,6 +300,15 @@ func (ua *PathAnalyzer) updateNodeStats(node *SegmentNode, threshold int) { shallowChildrenCopy(child, dynamicChild) } + // The absorbed children become dynamicChild's own children — + // update dynamicChild.Count so subsequent updateNodeStats calls + // on this node can correctly detect that the grandchild level + // also exceeds its threshold and trigger the next collapse. + // Without this, multi-level grids like /a/{many}/{many}/leaf + // only collapse the first level and leave the grandchild + // literals intact in the output. + dynamicChild.Count = len(dynamicChild.Children) + node.Children = map[string]*SegmentNode{ DynamicIdentifier: dynamicChild, } @@ -291,33 +326,77 @@ func shallowChildrenCopy(src, dst *SegmentNode) { } } +// CompareDynamic checks whether `regularPath` is matched by `dynamicPath`, +// where `dynamicPath` may contain DynamicIdentifier (⋯, single-segment +// wildcard) or WildcardIdentifier (*, zero-or-more-segment wildcard). +// The previous implementation only handled DynamicIdentifier, causing +// explicit `/etc/*` profile entries to never match at runtime — the +// node-agent R0002 rule (Files Access Anomalies) uses this to decide +// whether a file access is in-profile. func CompareDynamic(dynamicPath, regularPath string) bool { - dynamicIndex, regularIndex := 0, 0 - dynamicLen, regularLen := len(dynamicPath), len(regularPath) + dynamicSegments := strings.Split(dynamicPath, "/") + regularSegments := strings.Split(regularPath, "/") + return compareSegments(dynamicSegments, regularSegments) +} - for dynamicIndex < dynamicLen && regularIndex < regularLen { - // Find the next segment in dynamicPath - dynamicSegmentStart := dynamicIndex - for dynamicIndex < dynamicLen && dynamicPath[dynamicIndex] != '/' { - dynamicIndex++ +func compareSegments(dynamic, regular []string) bool { + if len(dynamic) == 0 { + return len(regular) == 0 + } + if dynamic[0] == WildcardIdentifier { + // A trailing `*` matches any remaining path tail (including empty). + if len(dynamic) == 1 { + return true } - dynamicSegment := dynamicPath[dynamicSegmentStart:dynamicIndex] - - // Find the next segment in regularPath - regularSegmentStart := regularIndex - for regularIndex < regularLen && regularPath[regularIndex] != '/' { - regularIndex++ + // Otherwise try to match the rest of the dynamic pattern starting + // at every position in the regular tail; the wildcard greedily + // consumes 0+ segments as long as the remainder still matches. + nextDynamic := dynamic[1] + for i := range regular { + match := nextDynamic == DynamicIdentifier || regular[i] == nextDynamic + if match && compareSegments(dynamic[1:], regular[i:]) { + return true + } } - regularSegment := regularPath[regularSegmentStart:regularIndex] + return false + } + if len(regular) == 0 { + return false + } + if dynamic[0] == DynamicIdentifier || dynamic[0] == regular[0] { + return compareSegments(dynamic[1:], regular[1:]) + } + return false +} - if dynamicSegment != DynamicIdentifier && dynamicSegment != regularSegment { - return false +// FindConfigForPath returns the CollapseConfig whose Prefix matches +// `path` with the longest match, or nil if none match. Exposed so +// callers and tests can introspect which threshold will apply to a +// given path without walking the trie. +func (ua *PathAnalyzer) FindConfigForPath(path string) *CollapseConfig { + var best *CollapseConfig + bestLen := -1 + for i := range ua.configs { + cfg := &ua.configs[i] + if hasPrefixAtBoundary(path, cfg.Prefix) && len(cfg.Prefix) > bestLen { + best = cfg + bestLen = len(cfg.Prefix) } - - // Move to the next segment - dynamicIndex++ - regularIndex++ } + // Fall back to the `/` default config if no per-prefix override + // matched — callers expect a non-nil result when *any* threshold + // applies, and the default always applies. + if best == nil { + return &ua.defaultCfg + } + return best +} - return dynamicIndex >= dynamicLen && regularIndex >= regularLen +// CollapseAdjacentDynamicIdentifiers replaces runs of adjacent +// DynamicIdentifier segments (e.g. "/a/⋯/⋯/b") with a single +// WildcardIdentifier ("/a/*/b"). Static segments between dynamic +// identifiers prevent collapsing. String wrapper over the internal +// byte-level collapseAdjacentDynamic, intended for test coverage. +func CollapseAdjacentDynamicIdentifiers(p string) string { + return string(collapseAdjacentDynamic([]byte(p))) } diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go index ab6565af8..93172a1aa 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go @@ -12,7 +12,7 @@ import ( ) func TestAnalyzeEndpoints(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) tests := []struct { name string @@ -72,6 +72,29 @@ func TestAnalyzeEndpoints(t *testing.T) { }, }, }, + { + name: "Test with 0 port", + input: []types.HTTPEndpoint{ + { + Endpoint: ":0/users/123/posts/\u22ef", + Methods: []string{"GET"}, + }, + { + Endpoint: ":80/users/\u22ef/posts/101", + Methods: []string{"POST"}, + }, + { + Endpoint: ":8770/users/blub/posts/101", + Methods: []string{"POST"}, + }, + }, + expected: []types.HTTPEndpoint{ + { + Endpoint: ":0/users/\u22ef/posts/\u22ef", + Methods: []string{"GET", "POST"}, + }, + }, + }, { name: "Test with different domains", input: []types.HTTPEndpoint{ @@ -145,10 +168,11 @@ func TestAnalyzeEndpoints(t *testing.T) { } func TestAnalyzeEndpointsWithThreshold(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.EndpointDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) var input []types.HTTPEndpoint - for i := 0; i < 101; i++ { + for i := 0; i < threshold+1; i++ { input = append(input, types.HTTPEndpoint{ Endpoint: fmt.Sprintf(":80/users/%d", i), Methods: []string{"GET"}, @@ -167,10 +191,11 @@ func TestAnalyzeEndpointsWithThreshold(t *testing.T) { } func TestAnalyzeEndpointsWithExactThreshold(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.EndpointDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) var input []types.HTTPEndpoint - for i := 0; i < 100; i++ { + for i := 0; i < threshold; i++ { input = append(input, types.HTTPEndpoint{ Endpoint: fmt.Sprintf(":80/users/%d", i), Methods: []string{"GET"}, @@ -179,18 +204,17 @@ func TestAnalyzeEndpointsWithExactThreshold(t *testing.T) { result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) - // Check that all 100 endpoints are still individual - assert.Equal(t, 100, len(result)) + // At exact threshold: all endpoints should remain individual + assert.Equal(t, threshold, len(result)) // Now add one more endpoint to trigger the dynamic behavior input = append(input, types.HTTPEndpoint{ - Endpoint: ":80/users/100", + Endpoint: fmt.Sprintf(":80/users/%d", threshold), Methods: []string{"GET"}, }) result = dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) - // Check that all endpoints are now merged into one dynamic endpoint expected := []types.HTTPEndpoint{ { Endpoint: ":80/users/\u22ef", @@ -201,7 +225,7 @@ func TestAnalyzeEndpointsWithExactThreshold(t *testing.T) { } func TestAnalyzeEndpointsWithInvalidURL(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) input := []types.HTTPEndpoint{ { @@ -213,3 +237,98 @@ func TestAnalyzeEndpointsWithInvalidURL(t *testing.T) { result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) assert.Equal(t, 0, len(result)) } + +func TestAnalyzeEndpointsWildcardPortAbsorbsSpecificPort(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) + + input := []types.HTTPEndpoint{ + { + Endpoint: ":0/users/123", + Methods: []string{"GET"}, + Direction: "outbound", + }, + { + Endpoint: ":80/users/456", + Methods: []string{"POST"}, + Direction: "outbound", + }, + } + + result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + + for _, ep := range result { + port := ep.Endpoint[:len(":0")] + assert.Equal(t, ":0", port, "endpoint %s should have wildcard port", ep.Endpoint) + } +} + +func TestAnalyzeEndpointsWildcardPortAfterSpecificPorts(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) + + input := []types.HTTPEndpoint{ + { + Endpoint: ":80/api/data", + Methods: []string{"GET"}, + Direction: "outbound", + }, + { + Endpoint: ":0/api/info", + Methods: []string{"POST"}, + Direction: "outbound", + }, + } + + result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + + for _, ep := range result { + port := ep.Endpoint[:len(":0")] + assert.Equal(t, ":0", port, "endpoint %s should have wildcard port", ep.Endpoint) + } +} + +func TestAnalyzeEndpointsMultiplePortsMergeIntoWildcard(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) + + input := []types.HTTPEndpoint{ + { + Endpoint: ":0/api/data", + Methods: []string{"GET"}, + Direction: "outbound", + }, + { + Endpoint: ":80/api/data", + Methods: []string{"POST"}, + Direction: "outbound", + }, + { + Endpoint: ":81/api/data", + Methods: []string{"PUT"}, + Direction: "outbound", + }, + } + + result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + + assert.Equal(t, 1, len(result)) + assert.Equal(t, ":0/api/data", result[0].Endpoint) + assert.Equal(t, []string{"GET", "POST", "PUT"}, result[0].Methods) +} + +func TestMergeDuplicateEndpointsWildcardPort(t *testing.T) { + wildcardEP := &types.HTTPEndpoint{ + Endpoint: ":0/api/data", + Methods: []string{"GET"}, + Direction: "outbound", + } + specificEP := &types.HTTPEndpoint{ + Endpoint: ":80/api/data", + Methods: []string{"POST"}, + Direction: "outbound", + } + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{wildcardEP, specificEP}) + + assert.Equal(t, 1, len(result)) + assert.Equal(t, ":0/api/data", result[0].Endpoint) + assert.Equal(t, []string{"GET", "POST"}, result[0].Methods) +} diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go index bc3834e62..f52632ae5 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go @@ -2,6 +2,8 @@ package dynamicpathdetectortests import ( "fmt" + "sort" + "strings" "testing" mapset "github.com/deckarep/golang-set/v2" @@ -10,11 +12,26 @@ import ( "github.com/stretchr/testify/assert" ) +// testCollapseConfigs gives the per-prefix thresholds this file's +// assertions expect. It is intentionally decoupled from the production +// DefaultCollapseConfigs so the deployed defaults and the test suite's +// collapse-behavior coverage can evolve independently — tests that +// probe threshold-1 / threshold-3 / threshold-5 edge cases shouldn't +// constrain what values ship. +var testCollapseConfigs = []dynamicpathdetector.CollapseConfig{ + {Prefix: "/etc", Threshold: 100}, + {Prefix: "/etc/apache2", Threshold: 5}, + {Prefix: "/opt", Threshold: 5}, + {Prefix: "/var/run", Threshold: 3}, + {Prefix: "/app", Threshold: 1}, +} + func TestAnalyzeOpensWithThreshold(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.OpenDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) var input []types.OpenCalls - for i := 0; i < 101; i++ { + for i := 0; i < threshold+1; i++ { input = append(input, types.OpenCalls{ Path: fmt.Sprintf("/home/user%d/file.txt", i), }) @@ -32,49 +49,20 @@ func TestAnalyzeOpensWithThreshold(t *testing.T) { assert.Equal(t, expected, result) } -func TestAnalyzeOpensWithThresholdAndExclusion(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) - - var input []types.OpenCalls - for i := 0; i < 101; i++ { - input = append(input, types.OpenCalls{ - Path: fmt.Sprintf("/home/user%d/file.txt", i), - Flags: []string{"READ"}, - }) - } - - expected := []types.OpenCalls{ - { - Path: "/home/user42/file.txt", - Flags: []string{"READ"}, - }, - { - Path: "/home/\u22ef/file.txt", - Flags: []string{"READ"}, - }, - } - - result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]("/home/user42/file.txt")) - assert.NoError(t, err) - assert.Equal(t, expected, result) -} - func TestAnalyzeOpensWithFlagMergingAndThreshold(t *testing.T) { + // Use /var/run threshold (3) — low enough that hand-written subtests work + threshold := configThreshold("/var/run") + tests := []struct { name string input []types.OpenCalls expected []types.OpenCalls }{ { - name: "Merge flags for paths exceeding threshold", - input: []types.OpenCalls{ - {Path: "/home/user1/file.txt", Flags: []string{"READ"}}, - {Path: "/home/user2/file.txt", Flags: []string{"WRITE"}}, - {Path: "/home/user3/file.txt", Flags: []string{"APPEND"}}, - {Path: "/home/user4/file.txt", Flags: []string{"READ", "WRITE"}}, - }, + name: "Merge flags for paths exceeding threshold", + input: generateOpenCallsWithFlags("/home", "file.txt", threshold+1), expected: []types.OpenCalls{ - {Path: "/home/\u22ef/file.txt", Flags: []string{"APPEND", "READ", "WRITE"}}, + {Path: "/home/\u22ef/file.txt", Flags: flagsForN(threshold + 1)}, }, }, { @@ -90,42 +78,33 @@ func TestAnalyzeOpensWithFlagMergingAndThreshold(t *testing.T) { }, { name: "Partial merging for some paths exceeding threshold", - input: []types.OpenCalls{ - {Path: "/home/user1/common.txt", Flags: []string{"READ"}}, - {Path: "/home/user2/common.txt", Flags: []string{"WRITE"}}, - {Path: "/home/user3/common.txt", Flags: []string{"APPEND"}}, - {Path: "/home/user4/common.txt", Flags: []string{"READ", "WRITE"}}, - {Path: "/var/log/app1.log", Flags: []string{"READ"}}, - {Path: "/var/log/app2.log", Flags: []string{"WRITE"}}, - }, + input: append( + generateOpenCallsWithFlags("/home", "common.txt", threshold+1), + types.OpenCalls{Path: "/var/log/app1.log", Flags: []string{"READ"}}, + types.OpenCalls{Path: "/var/log/app2.log", Flags: []string{"WRITE"}}, + ), expected: []types.OpenCalls{ - {Path: "/home/\u22ef/common.txt", Flags: []string{"APPEND", "READ", "WRITE"}}, + {Path: "/home/\u22ef/common.txt", Flags: flagsForN(threshold + 1)}, {Path: "/var/log/app1.log", Flags: []string{"READ"}}, {Path: "/var/log/app2.log", Flags: []string{"WRITE"}}, }, }, { name: "Multiple dynamic segments", - input: []types.OpenCalls{ - {Path: "/home/user1/file1.txt", Flags: []string{"READ"}}, - {Path: "/home/user2/file1.txt", Flags: []string{"WRITE"}}, - {Path: "/home/user3/file1.txt", Flags: []string{"APPEND"}}, - {Path: "/home/user4/file1.txt", Flags: []string{"READ", "WRITE"}}, - {Path: "/home/user1/file2.txt", Flags: []string{"READ"}}, - {Path: "/home/user2/file2.txt", Flags: []string{"WRITE"}}, - {Path: "/home/user3/file2.txt", Flags: []string{"APPEND"}}, - {Path: "/home/user4/file2.txt", Flags: []string{"READ", "WRITE"}}, - }, + input: append( + generateOpenCallsWithFlags("/home", "file1.txt", threshold+1), + generateOpenCallsWithFlags("/home", "file2.txt", threshold+1)..., + ), expected: []types.OpenCalls{ - {Path: "/home/\u22ef/file1.txt", Flags: []string{"APPEND", "READ", "WRITE"}}, - {Path: "/home/\u22ef/file2.txt", Flags: []string{"APPEND", "READ", "WRITE"}}, + {Path: "/home/\u22ef/file1.txt", Flags: flagsForN(threshold + 1)}, + {Path: "/home/\u22ef/file2.txt", Flags: flagsForN(threshold + 1)}, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(3) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) result, err := dynamicpathdetector.AnalyzeOpens(tt.input, analyzer, mapset.NewSet[string]()) assert.NoError(t, err) @@ -139,7 +118,591 @@ func TestAnalyzeOpensWithFlagMergingAndThreshold(t *testing.T) { } } -// Helper function to check if a slice of strings contains only unique elements +func TestAnalyzeOpensWithAsteriskAndEllipsis(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // Generate threshold paths + one ⋯ path to trigger collapse + var input []types.OpenCalls + for i := 0; i < threshold; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/home/user%d/file.txt", i), Flags: []string{"READ"}, + }) + } + input = append(input, + types.OpenCalls{Path: "/home/\u22ef/file.txt", Flags: []string{"READ"}}, + types.OpenCalls{Path: fmt.Sprintf("/home/user%d/file.txt", threshold), Flags: []string{"READ"}}, + ) + + expected := []types.OpenCalls{ + {Path: "/home/\u22ef/file.txt", Flags: []string{"READ"}}, + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + + assert.ElementsMatch(t, expected, result) +} + +func TestAnalyzeOpensWithMultiCollapse(t *testing.T) { + // NewPathAnalyzerWithConfigs with nil configs uses a uniform threshold (no per-prefix configs). + threshold := dynamicpathdetector.DefaultCollapseConfig.Threshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // Only 3 paths under /var/run — uniform threshold is 5, so 3 children <= 5. + // These should NOT collapse. + input := []types.OpenCalls{ + {Path: "/var/run/txt/file.txt", Flags: []string{"READ"}}, + {Path: "/var/run/txt1/file.txt", Flags: []string{"READ"}}, + {Path: "/var/run/txt2/file.txt", Flags: []string{"READ"}}, + } + + expected := []types.OpenCalls{ + {Path: "/var/run/txt/file.txt", Flags: []string{"READ"}}, + {Path: "/var/run/txt1/file.txt", Flags: []string{"READ"}}, + {Path: "/var/run/txt2/file.txt", Flags: []string{"READ"}}, + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + + assert.ElementsMatch(t, expected, result) +} + +func TestAnalyzeOpensWithDynamicConfigs(t *testing.T) { + etcThreshold := configThreshold("/etc") + optThreshold := configThreshold("/opt") + varRunThreshold := configThreshold("/var/run") + appThreshold := configThreshold("/app") + tmpThreshold := 10 // custom for this test + + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, []dynamicpathdetector.CollapseConfig{ + {Prefix: "/etc", Threshold: etcThreshold}, + {Prefix: "/opt", Threshold: optThreshold}, + {Prefix: "/var/run", Threshold: varRunThreshold}, + {Prefix: "/app", Threshold: appThreshold}, + {Prefix: "/tmp", Threshold: tmpThreshold}, + }) + + var pathsToAdd []string + + // /etc paths (high threshold) - should not collapse + for i := 0; i < 8; i++ { + pathsToAdd = append(pathsToAdd, fmt.Sprintf("/etc/config/item%d", i)) + } + pathsToAdd = append(pathsToAdd, + "/etc/hosts", + "/etc/resolv.conf", + "/etc/hostname", + "/etc/systemd/system.conf", + ) + // Total /etc: 12, well below etcThreshold (50) + + // /opt paths — exceed optThreshold to trigger collapse + for i := 0; i < optThreshold+1; i++ { + pathsToAdd = append(pathsToAdd, fmt.Sprintf("/opt/app%d/binary", i)) + } + + // /var/run paths — exceed varRunThreshold to trigger collapse + for i := 0; i < varRunThreshold+1; i++ { + pathsToAdd = append(pathsToAdd, fmt.Sprintf("/var/run/pid%d.pid", i)) + } + + // /app paths — appThreshold is 1, so second child triggers wildcard + pathsToAdd = append(pathsToAdd, + "/app/some/deep/path", + "/app/another/path", + ) + + // /tmp paths — exceed tmpThreshold to trigger collapse + for i := 0; i < tmpThreshold+1; i++ { + pathsToAdd = append(pathsToAdd, fmt.Sprintf("/tmp/user%d/a", i)) + } + + var input []types.OpenCalls + for _, p := range pathsToAdd { + input = append(input, types.OpenCalls{Path: p, Flags: []string{"READ"}}) + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + + // /etc paths (threshold 50) should NOT be collapsed + etcPaths := filterByPrefix(result, "/etc/") + assert.Equal(t, 12, len(etcPaths), "/etc paths should remain individual (below threshold %d)", etcThreshold) + + // /app (threshold 1) - immediately collapses to wildcard + assertContainsPath(t, result, "/app/*") + + // /opt — collapses; both wildcard and dynamic-with-subtree are acceptable + assertContainsOneOfPaths(t, result, "/opt/*", "/opt/\u22ef/binary") + + // /tmp — collapses + assertContainsOneOfPaths(t, result, "/tmp/*", "/tmp/\u22ef/a") + + // /var/run — collapses + assertContainsOneOfPaths(t, result, "/var/run/*", "/var/run/\u22ef") + + // Total: 12 etc + 1 app + 1 opt + 1 tmp + 1 var/run = 16 + assert.Equal(t, 16, len(result), "expected 16 total paths, got %d: %v", len(result), pathsFromResult(result)) +} + +// TestAnalyzeOpensCollapseExactBoundary verifies that threshold is strictly "greater than", +// not "greater than or equal". With threshold N, exactly N children should NOT collapse, +// but N+1 children SHOULD. +func TestAnalyzeOpensCollapseExactBoundary(t *testing.T) { + threshold := dynamicpathdetector.DefaultCollapseConfig.Threshold + + t.Run("at threshold - no collapse", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + var input []types.OpenCalls + for i := 0; i < threshold; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/data/item%d/info", i), + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, threshold, len(result), "at exact threshold, paths should NOT collapse") + for _, r := range result { + assert.NotContains(t, r.Path, "\u22ef", "no dynamic segment expected") + assert.NotContains(t, r.Path, "*", "no wildcard expected") + } + }) + + t.Run("above threshold - collapse", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/data/item%d/info", i), + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "above threshold, paths should collapse to 1") + assert.Equal(t, "/data/\u22ef/info", result[0].Path, "single \u22ef should not collapse to *") + }) +} + +// TestAnalyzeOpensDuplicatePathsNoCollapse verifies that repeating the same path +// many times does NOT trigger a collapse - only unique segment names count. +func TestAnalyzeOpensDuplicatePathsNoCollapse(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + var input []types.OpenCalls + // Repeat the same path many times — should NOT trigger collapse + for i := 0; i < threshold*10; i++ { + input = append(input, types.OpenCalls{ + Path: "/data/same-child/file.txt", + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result)) + assert.Equal(t, "/data/same-child/file.txt", result[0].Path, "duplicate paths should not trigger collapse") +} + +// TestAnalyzeOpensVaryingDepthsUnderPrefix verifies collapse behavior when paths +// under the same prefix have different depths. +func TestAnalyzeOpensVaryingDepthsUnderPrefix(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // Generate threshold+1 unique children under /data to trigger collapse + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/data/%c/deep/file", 'a'+rune(i)), + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + for _, r := range result { + assert.True(t, + strings.Contains(r.Path, "\u22ef") || strings.Contains(r.Path, "*"), + "path %q should contain a dynamic or wildcard segment after collapse", r.Path) + } +} + +// TestAnalyzeOpensNewPathAfterCollapse verifies that a new path arriving after +// the threshold was already crossed gets absorbed by the collapsed node. +func TestAnalyzeOpensNewPathAfterCollapse(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // First batch: trigger collapse with threshold+1 children + var batch1 []types.OpenCalls + for i := 0; i < threshold+1; i++ { + batch1 = append(batch1, types.OpenCalls{ + Path: fmt.Sprintf("/srv/%c/log", 'a'+rune(i)), Flags: []string{"READ"}, + }) + } + result1, err := dynamicpathdetector.AnalyzeOpens(batch1, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result1), "first batch should collapse to 1 path") + + // Second batch: add a completely new child — it should be absorbed + batch2 := append(batch1, types.OpenCalls{ + Path: "/srv/new-service/log", Flags: []string{"WRITE"}, + }) + result2, err := dynamicpathdetector.AnalyzeOpens(batch2, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result2), "new path after collapse should be absorbed") + assert.Contains(t, result2[0].Flags, "WRITE", "flags from new path should be merged") +} + +// TestAnalyzeOpensDefaultThresholdForUnconfiguredPrefix verifies that paths under +// a prefix without a specific config use the default threshold. +func TestAnalyzeOpensDefaultThresholdForUnconfiguredPrefix(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, []dynamicpathdetector.CollapseConfig{ + {Prefix: "/configured", Threshold: 2}, + }) + + // /configured has threshold 2: 3 children should collapse + configuredInput := []types.OpenCalls{ + {Path: "/configured/a/file", Flags: []string{"READ"}}, + {Path: "/configured/b/file", Flags: []string{"READ"}}, + {Path: "/configured/c/file", Flags: []string{"READ"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(configuredInput, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "/configured should collapse with threshold 2") + + // /unconfigured uses default threshold: 3 children should NOT collapse + defaultThreshold := dynamicpathdetector.DefaultCollapseConfig.Threshold + analyzer2 := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, []dynamicpathdetector.CollapseConfig{ + {Prefix: "/configured", Threshold: 2}, + }) + unconfiguredInput := []types.OpenCalls{ + {Path: "/unconfigured/a/file", Flags: []string{"READ"}}, + {Path: "/unconfigured/b/file", Flags: []string{"READ"}}, + {Path: "/unconfigured/c/file", Flags: []string{"READ"}}, + } + result2, err := dynamicpathdetector.AnalyzeOpens(unconfiguredInput, analyzer2, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 3, len(result2), + "/unconfigured should NOT collapse with default threshold %d", defaultThreshold) +} + +// TestAnalyzeOpensThreshold1ImmediateWildcard verifies that threshold 1 produces +// a wildcard (*) on the very first additional child. +func TestAnalyzeOpensThreshold1ImmediateWildcard(t *testing.T) { + appThreshold := configThreshold("/app") // threshold 1 + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, []dynamicpathdetector.CollapseConfig{ + {Prefix: "/instant", Threshold: appThreshold}, + }) + + t.Run("single path - no collapse yet", func(t *testing.T) { + input := []types.OpenCalls{ + {Path: "/instant/only-child/data", Flags: []string{"READ"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result)) + assert.Equal(t, "/instant/*", result[0].Path, "threshold 1 should wildcard immediately") + }) + + t.Run("two paths - collapsed", func(t *testing.T) { + analyzer2 := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, []dynamicpathdetector.CollapseConfig{ + {Prefix: "/instant", Threshold: appThreshold}, + }) + input := []types.OpenCalls{ + {Path: "/instant/first/data", Flags: []string{"READ"}}, + {Path: "/instant/second/data", Flags: []string{"WRITE"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer2, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result)) + assert.Equal(t, "/instant/*", result[0].Path) + assert.ElementsMatch(t, []string{"READ", "WRITE"}, result[0].Flags) + }) +} + +// TestAnalyzeOpensCollapseDoesNotAffectSiblingPrefixes verifies that collapsing +// one prefix does not affect paths under a sibling prefix. +func TestAnalyzeOpensCollapseDoesNotAffectSiblingPrefixes(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // /alpha: threshold+1 children → should collapse + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/alpha/a%d/file", i), Flags: []string{"READ"}, + }) + } + // /beta: 2 children → should NOT collapse (2 <= threshold) + input = append(input, + types.OpenCalls{Path: "/beta/b1/file", Flags: []string{"WRITE"}}, + types.OpenCalls{Path: "/beta/b2/file", Flags: []string{"WRITE"}}, + ) + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + + betaPaths := filterByPrefix(result, "/beta/") + assert.Equal(t, 2, len(betaPaths), "/beta paths should remain individual") + + alphaPaths := filterByPrefix(result, "/alpha/") + assert.Equal(t, 1, len(alphaPaths), "/alpha paths should collapse to 1") +} + +// TestAnalyzeOpensFlagMergingAfterCollapse verifies that flags from all paths +// that collapse into the same dynamic node are properly merged and deduplicated. +func TestAnalyzeOpensFlagMergingAfterCollapse(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // Generate threshold+1 children to trigger collapse, with varied flags + var input []types.OpenCalls + flags := [][]string{{"READ", "WRITE"}, {"WRITE", "APPEND"}, {"READ"}, {"APPEND", "READ"}} + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/logs/service%d/app.log", i), + Flags: flags[i%len(flags)], + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result)) + assert.ElementsMatch(t, []string{"APPEND", "READ", "WRITE"}, result[0].Flags, "flags should be merged and deduplicated") + assert.True(t, areStringSlicesUnique(result[0].Flags), "flags must be unique") +} + +// TestAnalyzeOpensMultipleLevelsOfCollapse verifies behavior when both parent and +// grandchild segments independently exceed their thresholds. +func TestAnalyzeOpensMultipleLevelsOfCollapse(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + var input []types.OpenCalls + // threshold+1 unique children under /multi, each with threshold+1 unique grandchildren + for i := 0; i < threshold+1; i++ { + for j := 0; j < threshold+1; j++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/multi/level%d/sub%d/file", i, j), + Flags: []string{"READ"}, + }) + } + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "double collapse should yield a single path") + assert.True(t, + strings.Contains(result[0].Path, "\u22ef") || strings.Contains(result[0].Path, "*"), + "result %q should contain dynamic or wildcard segments", result[0].Path) +} + +// TestAnalyzeOpensExistingDynamicSegmentInInput verifies that input paths +// already containing ⋯ are handled correctly and merge with new paths. +func TestAnalyzeOpensExistingDynamicSegmentInInput(t *testing.T) { + // Use a high threshold so that the two paths alone don't trigger collapse — + // instead, the existing ⋯ segment absorbs the specific path. + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) + input := []types.OpenCalls{ + {Path: "/data/\u22ef/config", Flags: []string{"READ"}}, + {Path: "/data/specific/config", Flags: []string{"WRITE"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result)) + assert.Equal(t, "/data/\u22ef/config", result[0].Path) + assert.ElementsMatch(t, []string{"READ", "WRITE"}, result[0].Flags) +} + +// TestAnalyzeOpens_NilSbomSetNoError verifies that passing a nil sbomSet +// does not return an error. +func TestAnalyzeOpens_NilSbomSetNoError(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + input := []types.OpenCalls{ + {Path: "/usr/lib/libfoo.so", Flags: []string{"READ"}}, + {Path: "/usr/lib/libbar.so", Flags: []string{"READ"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, nil) + assert.NoError(t, err, "nil sbomSet should not cause an error") + assert.Equal(t, 2, len(result), "paths below threshold should remain individual") +} + +// TestAnalyzeOpens_NilSbomSetWithCollapse verifies that collapse works +// correctly even when sbomSet is nil. +func TestAnalyzeOpens_NilSbomSetWithCollapse(t *testing.T) { + threshold := configThreshold("/var/run") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/usr/lib/lib%c.so", 'a'+rune(i)), + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, nil) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "%d children > threshold %d, should collapse", threshold+1, threshold) + assert.True(t, + strings.Contains(result[0].Path, "\u22ef") || strings.Contains(result[0].Path, "*"), + "collapsed path should contain dynamic or wildcard segment, got %q", result[0].Path) +} + +// TestAnalyzeOpens_SbomPathsNeverCollapsed proves that paths present in the +// sbomSet are always preserved verbatim, even when surrounding paths exceed +// the collapse threshold. This is critical: SBOM paths map to specific +// library files used for vulnerability scanning — collapsing them would +// make vulnerability results non-reproducible. +func TestAnalyzeOpens_SbomPathsNeverCollapsed(t *testing.T) { + threshold := configThreshold("/var/run") // low threshold to trigger collapse easily + + sbomPaths := []string{ + "/usr/lib/libcrypto.so.3", + "/usr/lib/libssl.so.3", + "/usr/lib/libz.so.1", + } + sbomSet := mapset.NewSet[string](sbomPaths...) + + t.Run("sbom paths survive collapse when siblings exceed threshold", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + // Generate threshold+1 non-SBOM paths under /usr/lib to trigger collapse + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/usr/lib/libextra_%d.so", i), + Flags: []string{"READ"}, + }) + } + // Add SBOM paths — these must survive even though /usr/lib/ children > threshold + for _, p := range sbomPaths { + input = append(input, types.OpenCalls{ + Path: p, + Flags: []string{"READ"}, + }) + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, sbomSet) + assert.NoError(t, err) + + // Every SBOM path must appear in the output with its exact original path + resultPaths := pathsFromResult(result) + for _, sp := range sbomPaths { + assert.Contains(t, resultPaths, sp, + "SBOM path %q must be preserved verbatim in output, got: %v", sp, resultPaths) + } + }) + + t.Run("sbom paths retain their original flags", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/usr/lib/libother_%d.so", i), + Flags: []string{"WRITE"}, + }) + } + input = append(input, types.OpenCalls{ + Path: "/usr/lib/libcrypto.so.3", + Flags: []string{"READ", "MMAP"}, + }) + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, sbomSet) + assert.NoError(t, err) + + for _, r := range result { + if r.Path == "/usr/lib/libcrypto.so.3" { + assert.ElementsMatch(t, []string{"READ", "MMAP"}, r.Flags, + "SBOM path flags must be preserved exactly, not merged with collapsed siblings") + return + } + } + t.Fatal("SBOM path /usr/lib/libcrypto.so.3 not found in output") + }) + + t.Run("non-sbom paths still collapse normally", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/var/data/file%d.dat", i), + Flags: []string{"READ"}, + }) + } + // Add one SBOM path under a different prefix — should be preserved but + // should not prevent /var/data from collapsing + input = append(input, types.OpenCalls{ + Path: "/usr/lib/libcrypto.so.3", + Flags: []string{"READ"}, + }) + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, sbomSet) + assert.NoError(t, err) + + varDataPaths := filterByPrefix(result, "/var/data/") + assert.Equal(t, 1, len(varDataPaths), + "non-SBOM paths under /var/data should still collapse, got: %v", pathsFromResult(varDataPaths)) + assert.True(t, + strings.Contains(varDataPaths[0].Path, "\u22ef") || strings.Contains(varDataPaths[0].Path, "*"), + "collapsed path should contain dynamic segment, got %q", varDataPaths[0].Path) + + // SBOM path must still be present + assert.Contains(t, pathsFromResult(result), "/usr/lib/libcrypto.so.3") + }) + + t.Run("empty sbomSet does not protect any paths", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + emptySbom := mapset.NewSet[string]() + + var input []types.OpenCalls + for i := 0; i < threshold+1; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/usr/lib/lib%d.so", i), + Flags: []string{"READ"}, + }) + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, emptySbom) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "with empty sbomSet, all paths should collapse normally") + }) +} + +// --- Helpers --- + +// generateOpenCallsWithFlags creates N OpenCalls under prefix/userN/filename with rotating flags. +func generateOpenCallsWithFlags(prefix, filename string, n int) []types.OpenCalls { + allFlags := []string{"READ", "WRITE", "APPEND"} + var result []types.OpenCalls + for i := 0; i < n; i++ { + result = append(result, types.OpenCalls{ + Path: fmt.Sprintf("%s/user%d/%s", prefix, i, filename), + Flags: []string{allFlags[i%len(allFlags)]}, + }) + } + return result +} + +// flagsForN returns the sorted, unique flags that generateOpenCallsWithFlags would produce for N items. +func flagsForN(n int) []string { + allFlags := []string{"READ", "WRITE", "APPEND"} + seen := map[string]bool{} + for i := 0; i < n; i++ { + seen[allFlags[i%len(allFlags)]] = true + } + var result []string + for f := range seen { + result = append(result, f) + } + sort.Strings(result) + return result +} + func areStringSlicesUnique(slice []string) bool { seen := make(map[string]struct{}) for _, s := range slice { @@ -150,3 +713,231 @@ func areStringSlicesUnique(slice []string) bool { } return true } + +func assertContainsPath(t *testing.T, result []types.OpenCalls, path string) { + t.Helper() + for _, r := range result { + if r.Path == path { + return + } + } + assert.Fail(t, fmt.Sprintf("result does not contain path %q, got: %v", path, pathsFromResult(result))) +} + +func assertContainsOneOfPaths(t *testing.T, result []types.OpenCalls, alternatives ...string) { + t.Helper() + for _, r := range result { + for _, alt := range alternatives { + if r.Path == alt { + return + } + } + } + assert.Fail(t, fmt.Sprintf("result does not contain any of %v, got: %v", alternatives, pathsFromResult(result))) +} + +func assertPathIsOneOf(t *testing.T, actual string, alternatives ...string) { + t.Helper() + for _, alt := range alternatives { + if actual == alt { + return + } + } + assert.Fail(t, fmt.Sprintf("path %q does not match any of %v", actual, alternatives)) +} + +func filterByPrefix(result []types.OpenCalls, prefix string) []types.OpenCalls { + var filtered []types.OpenCalls + for _, r := range result { + if strings.HasPrefix(r.Path, prefix) { + filtered = append(filtered, r) + } + } + return filtered +} + +func pathsFromResult(result []types.OpenCalls) []string { + paths := make([]string, len(result)) + for i, r := range result { + paths[i] = r.Path + } + return paths +} + +// TestAnalyzeOpensOverlappingPrefixConfigs verifies that overlapping prefix configs +// (e.g., /etc at 100 and /etc/apache2 at 5) work correctly: the most specific prefix wins. +func TestAnalyzeOpensOverlappingPrefixConfigs(t *testing.T) { + t.Run("/etc/apache2 uses threshold 5, not /etc's threshold 100", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + // 6 paths under /etc/apache2/mods-enabled/ — should collapse (6 > 5) + var input []types.OpenCalls + for i := 0; i < 6; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/etc/apache2/mods-enabled/mod%d.conf", i), + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "6 paths > threshold 5 should collapse to 1, got: %v", pathsFromResult(result)) + assert.True(t, + strings.Contains(result[0].Path, "\u22ef") || strings.Contains(result[0].Path, "*"), + "collapsed path should contain dynamic segment, got %q", result[0].Path) + }) + + t.Run("/etc uses threshold 100, unaffected by /etc/apache2", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + // 8 paths directly under /etc/ — should NOT collapse (8 < 100) + input := []types.OpenCalls{ + {Path: "/etc/config1", Flags: []string{"READ"}}, + {Path: "/etc/config2", Flags: []string{"READ"}}, + {Path: "/etc/config3", Flags: []string{"READ"}}, + {Path: "/etc/config4", Flags: []string{"READ"}}, + {Path: "/etc/config5", Flags: []string{"READ"}}, + {Path: "/etc/config6", Flags: []string{"READ"}}, + {Path: "/etc/config7", Flags: []string{"READ"}}, + {Path: "/etc/config8", Flags: []string{"READ"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 8, len(result), "/etc paths should NOT collapse (8 < 100), got: %v", pathsFromResult(result)) + }) + + t.Run("unconfigured prefix /var/log uses default threshold", func(t *testing.T) { + defaultThreshold := dynamicpathdetector.DefaultCollapseConfig.Threshold + // At threshold — should NOT collapse + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + var input []types.OpenCalls + for i := 0; i < defaultThreshold; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/var/log/app%d.log", i), + Flags: []string{"READ"}, + }) + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, defaultThreshold, len(result), + "/var/log at exactly default threshold %d should NOT collapse", defaultThreshold) + + // One more — should collapse + analyzer2 := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/var/log/app%d.log", defaultThreshold), + Flags: []string{"READ"}, + }) + result2, err := dynamicpathdetector.AnalyzeOpens(input, analyzer2, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result2), + "/var/log exceeding default threshold %d should collapse", defaultThreshold) + }) + + t.Run("/var/run uses its own threshold 3, not default", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + // 4 paths under /var/run/ — should collapse (4 > 3) + input := []types.OpenCalls{ + {Path: "/var/run/pid1.pid", Flags: []string{"READ"}}, + {Path: "/var/run/pid2.pid", Flags: []string{"READ"}}, + {Path: "/var/run/pid3.pid", Flags: []string{"READ"}}, + {Path: "/var/run/pid4.pid", Flags: []string{"READ"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result), "4 paths > threshold 3 should collapse, got: %v", pathsFromResult(result)) + }) + + t.Run("/app uses threshold 1 (immediate wildcard)", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + input := []types.OpenCalls{ + {Path: "/app/service1/config", Flags: []string{"READ"}}, + } + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + assert.Equal(t, 1, len(result)) + assert.Equal(t, "/app/*", result[0].Path, "threshold 1 should produce wildcard immediately") + }) + + t.Run("mixed overlapping: /etc and /etc/apache2 coexist correctly", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + var input []types.OpenCalls + + // 6 paths under /etc/apache2/conf.d/ (should collapse at threshold 5) + for i := 0; i < 6; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/etc/apache2/conf.d/site%d.conf", i), + Flags: []string{"READ"}, + }) + } + + // 8 paths directly under /etc/ (should NOT collapse at threshold 100) + for i := 0; i < 8; i++ { + input = append(input, types.OpenCalls{ + Path: fmt.Sprintf("/etc/setting%d.conf", i), + Flags: []string{"READ"}, + }) + } + + result, err := dynamicpathdetector.AnalyzeOpens(input, analyzer, mapset.NewSet[string]()) + assert.NoError(t, err) + + // /etc/apache2 paths should have collapsed + apache2Paths := filterByPrefix(result, "/etc/apache2/") + assert.Equal(t, 1, len(apache2Paths), + "/etc/apache2 paths (6 > threshold 5) should collapse to 1, got: %v", pathsFromResult(apache2Paths)) + assert.True(t, + strings.Contains(apache2Paths[0].Path, "\u22ef") || strings.Contains(apache2Paths[0].Path, "*"), + "collapsed apache2 path should contain dynamic segment, got %q", apache2Paths[0].Path) + + // /etc direct paths should remain individual + etcDirectPaths := []types.OpenCalls{} + for _, r := range result { + if strings.HasPrefix(r.Path, "/etc/") && !strings.HasPrefix(r.Path, "/etc/apache2/") { + etcDirectPaths = append(etcDirectPaths, r) + } + } + assert.Equal(t, 8, len(etcDirectPaths), + "/etc direct paths (8 < threshold 100) should remain individual, got: %v", pathsFromResult(etcDirectPaths)) + }) +} + +// TestFindConfigForPath verifies the config lookup returns the most specific matching prefix. +func TestFindConfigForPath(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, testCollapseConfigs) + + tests := []struct { + path string + expectedPrefix string + expectedThreshold int + }{ + { + path: "/etc/apache2/mods-enabled/file", + expectedPrefix: "/etc/apache2", + expectedThreshold: 5, + }, + { + path: "/etc/hosts", + expectedPrefix: "/etc", + expectedThreshold: 100, + }, + { + path: "/var/run/pid1.pid", + expectedPrefix: "/var/run", + expectedThreshold: 3, + }, + { + path: "/var/log/app.log", + expectedPrefix: "/", + expectedThreshold: dynamicpathdetector.DefaultCollapseConfig.Threshold, + }, + } + + for _, tt := range tests { + t.Run(tt.path, func(t *testing.T) { + config := analyzer.FindConfigForPath(tt.path) + assert.NotNil(t, config, "config should not be nil for path %q", tt.path) + assert.Equal(t, tt.expectedPrefix, config.Prefix, + "path %q should match prefix %q", tt.path, tt.expectedPrefix) + assert.Equal(t, tt.expectedThreshold, config.Threshold, + "path %q should have threshold %d", tt.path, tt.expectedThreshold) + }) + } +} diff --git a/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go b/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go index 4ca01af42..09dbdf56a 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go @@ -13,7 +13,7 @@ import ( ) func BenchmarkAnalyzePath(b *testing.B) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) paths := generateMixedPaths(10000, 0) // 0 means use default mixed lengths identifier := "test" @@ -33,7 +33,7 @@ func BenchmarkAnalyzePathWithDifferentLengths(b *testing.B) { for _, length := range pathLengths { b.Run(fmt.Sprintf("PathLength-%d", length), func(b *testing.B) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) paths := generateMixedPaths(10000, length) identifier := "test" @@ -52,7 +52,7 @@ func BenchmarkAnalyzePathWithDifferentLengths(b *testing.B) { func BenchmarkAnalyzeOpensVsDeflateStringer(b *testing.B) { paths := pathsToOpens(generateMixedPaths(10000, 0)) - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) b.Run("AnalyzeOpens", func(b *testing.B) { b.ResetTimer() @@ -72,14 +72,14 @@ func BenchmarkAnalyzeOpensVsDeflateStringer(b *testing.B) { }) } -func BenchmarkCompareDynamic(b *testing.B) { - dynamicPath := "/api/\u22ef/\u22ef" - regularPath := "/api/users/123" - for i := 0; i < b.N; i++ { - _ = dynamicpathdetector.CompareDynamic(dynamicPath, regularPath) - } - b.ReportAllocs() -} +// func BenchmarkCompareDynamic(b *testing.B) { +// dynamicPath := "/api/\u22ef/\u22ef" +// regularPath := "/api/users/123" +// for i := 0; i < b.N; i++ { +// _ = dynamicpathdetector.CompareDynamic(dynamicPath, regularPath) +// } +// b.ReportAllocs() +// } func generateMixedPaths(count int, fixedLength int) []string { paths := make([]string, count) diff --git a/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go b/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go index 8f05b9606..7509b91e4 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go @@ -8,15 +8,30 @@ import ( "github.com/stretchr/testify/assert" ) -func TestNewPathAnalyzer(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) +// configThreshold returns the collapse threshold for the given path prefix +// as this test suite expects it — sourced from testCollapseConfigs (defined +// in analyze_opens_test.go), NOT from the production DefaultCollapseConfigs. +// The decoupling is deliberate: tests probe threshold-1/3/5 edge cases and +// shouldn't constrain what values ship in production defaults. +// Falls back to DefaultCollapseConfig.Threshold for unknown prefixes. +func configThreshold(prefix string) int { + for _, cfg := range testCollapseConfigs { + if cfg.Prefix == prefix { + return cfg.Threshold + } + } + return dynamicpathdetector.DefaultCollapseConfig.Threshold +} + +func TestNewPathAnalyzerWithConfigs(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) if analyzer == nil { - t.Error("NewPathAnalyzer() returned nil") + t.Error("NewPathAnalyzerWithConfigs() returned nil") } } func TestAnalyzePath(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) testCases := []struct { name string @@ -39,17 +54,46 @@ func TestAnalyzePath(t *testing.T) { } } +func TestCollapseAdjacentDynamicIdentifiers(t *testing.T) { + testCases := []struct { + name string + path string + expected string + }{ + {"No dynamic identifiers", "/a/b/c", "/a/b/c"}, + {"Single dynamic identifier", "/a/\u22ef/c", "/a/\u22ef/c"}, + {"Two adjacent dynamic identifiers", "/a/\u22ef/\u22ef/d", "/a/*/d"}, + {"Three adjacent dynamic identifiers", "/a/\u22ef/\u22ef/\u22ef/e", "/a/*/e"}, + {"Dynamic identifiers separated by static segment", "/\u22ef/b/\u22ef/d", "/\u22ef/b/\u22ef/d"}, + {"Multiple groups of adjacent identifiers", "/\u22ef/\u22ef/c/\u22ef/\u22ef/f", "/*/c/*/f"}, + {"Starts with adjacent identifiers", "/\u22ef/\u22ef/c", "/*/c"}, + {"Ends with adjacent identifiers", "/a/\u22ef/\u22ef", "/a/*"}, + {"Only adjacent identifiers", "/\u22ef/\u22ef", "/*"}, + {"Path with leading slash", "/\u22ef/\u22ef", "/*"}, + {"Empty path", "", ""}, + {"Single segment path", "a", "a"}, + {"Single dynamic segment path", "\u22ef", "\u22ef"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := dynamicpathdetector.CollapseAdjacentDynamicIdentifiers(tc.path) + assert.Equal(t, tc.expected, result, "Path was not collapsed as expected. Got %s, want %s", result, tc.expected) + }) + } +} + func TestDynamicSegments(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.OpenDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) - // Create 99 different paths under the 'users' segment - for i := 0; i < 101; i++ { + for i := 0; i < threshold+1; i++ { path := fmt.Sprintf("/api/users/%d", i) _, err := analyzer.AnalyzePath(path, "api") assert.NoError(t, err) } - result, err := analyzer.AnalyzePath("/api/users/101", "api") + result, err := analyzer.AnalyzePath(fmt.Sprintf("/api/users/%d", threshold+1), "api") if err != nil { t.Errorf("AnalyzePath() returned an error: %v", err) } @@ -57,16 +101,16 @@ func TestDynamicSegments(t *testing.T) { assert.Equal(t, expected, result) // Test with one of the original IDs to ensure it's also marked as dynamic - result, err = analyzer.AnalyzePath("/api/users/50", "api") + result, err = analyzer.AnalyzePath("/api/users/0", "api") assert.NoError(t, err) assert.Equal(t, expected, result) } func TestMultipleDynamicSegments(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.OpenDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) - // Create 99 different paths for both 'users' and 'posts' segments - for i := 0; i < 110; i++ { + for i := 0; i < threshold+10; i++ { path := fmt.Sprintf("/api/users/%d/posts/%d", i, i) _, err := analyzer.AnalyzePath(path, "api") if err != nil { @@ -74,18 +118,17 @@ func TestMultipleDynamicSegments(t *testing.T) { } } - // Test with the 100th unique user and post IDs (should trigger dynamic segments) - result, err := analyzer.AnalyzePath("/api/users/101/posts/1031", "api") + result, err := analyzer.AnalyzePath(fmt.Sprintf("/api/users/%d/posts/%d", threshold+11, threshold+11), "api") assert.NoError(t, err) expected := "/api/users/\u22ef/posts/\u22ef" assert.Equal(t, expected, result) } func TestMixedStaticAndDynamicSegments(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.OpenDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) - // Create 99 different paths for 'users' but keep 'posts' static - for i := 0; i < 101; i++ { + for i := 0; i < threshold+1; i++ { path := fmt.Sprintf("/api/users/%d/posts", i) _, err := analyzer.AnalyzePath(path, "api") if err != nil { @@ -93,42 +136,40 @@ func TestMixedStaticAndDynamicSegments(t *testing.T) { } } - // Test with the 100th unique user ID but same 'posts' segment (should trigger dynamic segment for users) - result, err := analyzer.AnalyzePath("/api/users/99/posts", "api") + result, err := analyzer.AnalyzePath("/api/users/0/posts", "api") assert.NoError(t, err) expected := "/api/users/\u22ef/posts" assert.Equal(t, expected, result) } func TestDifferentRootIdentifiers(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) - // Analyze paths with different root identifiers result1, _ := analyzer.AnalyzePath("/api/users/123", "api") result2, _ := analyzer.AnalyzePath("/api/products/456", "store") assert.Equal(t, "/api/users/123", result1) - assert.Equal(t, "/api/products/456", result2) } func TestDynamicThreshold(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + threshold := dynamicpathdetector.OpenDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) - for i := 0; i < 101; i++ { + for i := 0; i < threshold+1; i++ { path := fmt.Sprintf("/api/users/%d", i) result, _ := analyzer.AnalyzePath(path, "api") if result != fmt.Sprintf("/api/users/%d", i) { - t.Errorf("Path became dynamic before reaching 99 different paths") + t.Errorf("Path became dynamic before reaching %d different paths", threshold) } } - result, _ := analyzer.AnalyzePath("/api/users/991", "api") + result, _ := analyzer.AnalyzePath(fmt.Sprintf("/api/users/%d", threshold+2), "api") assert.Equal(t, "/api/users/\u22ef", result) } func TestEdgeCases(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) testCases := []struct { name string @@ -151,100 +192,129 @@ func TestEdgeCases(t *testing.T) { } func TestDynamicInsertion(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, nil) - // Insert a new path with a different identifier result, err := analyzer.AnalyzePath("/api/users/\u22ef", "api") assert.NoError(t, err) expected := "/api/users/\u22ef" assert.Equal(t, expected, result) - // Insert a new path with the same identifier result, err = analyzer.AnalyzePath("/api/users/102", "api") assert.NoError(t, err) expected = "/api/users/\u22ef" assert.Equal(t, expected, result) } -func TestCompareDynamic(t *testing.T) { - tests := []struct { - name string - dynamicPath string - regularPath string - want bool - }{ - { - name: "Equal paths", - dynamicPath: "/api/users/123", - regularPath: "/api/users/123", - want: true, - }, - { - name: "Different paths", - dynamicPath: "/api/users/123", - regularPath: "/api/users/456", - want: false, - }, - { - name: "Dynamic segment at the end", - dynamicPath: "/api/users/\u22ef", - regularPath: "/api/users/123", - want: true, - }, - { - name: "Dynamic segment at the end", - dynamicPath: "/api/users/\u22ef", - regularPath: "/api/users/123/posts", - want: false, - }, - { - name: "Dynamic segment at the end, no match", - dynamicPath: "/api/users/\u22ef", - regularPath: "/api/apps/123", - want: false, - }, - { - name: "Dynamic segment in the middle", - dynamicPath: "/api/\u22ef/123", - regularPath: "/api/users/123", - want: true, - }, - { - name: "Dynamic segment in the middle, no match", - dynamicPath: "/api/\u22ef/123", - regularPath: "/api/users/456", - want: false, - }, - { - name: "2 dynamic segments", - dynamicPath: "/api/\u22ef/\u22ef", - regularPath: "/api/users/123", - want: true, - }, - { - name: "2 dynamic segments, no match", - dynamicPath: "/api/\u22ef/\u22ef", - regularPath: "/papi/users/456", - want: false, - }, +func TestDynamic(t *testing.T) { + threshold := dynamicpathdetector.OpenDynamicThreshold + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + for i := 0; i < threshold+1; i++ { + path := fmt.Sprintf("/api/users/%d", i) + _, err := analyzer.AnalyzePath(path, "api") + assert.NoError(t, err) + } + result, err := analyzer.AnalyzePath(fmt.Sprintf("/api/users/%d", threshold+1), "api") + assert.NoError(t, err) + expected := "/api/users/\u22ef" + assert.Equal(t, expected, result) +} + +func TestCollapseConfig(t *testing.T) { + appThreshold := configThreshold("/app") + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, []dynamicpathdetector.CollapseConfig{ { - name: "2 other dynamic segments", - dynamicPath: "/\u22ef/users/\u22ef", - regularPath: "/api/users/123", - want: true, + Prefix: "/api", + Threshold: appThreshold, }, { - name: "2 other dynamic segments, no match", - dynamicPath: "/\u22ef/users/\u22ef", - regularPath: "/api/apps/456", - want: false, + Prefix: "/169.254.169.254", + Threshold: configThreshold("/etc"), }, + }) + for i := 0; i < appThreshold+1; i++ { + path := fmt.Sprintf("/api/users/%d", i) + _, err := analyzer.AnalyzePath(path, "api") + assert.NoError(t, err) } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := dynamicpathdetector.CompareDynamic(tt.dynamicPath, tt.regularPath); got != tt.want { - t.Errorf("CompareDynamic() = %v, want %v", got, tt.want) + result, err := analyzer.AnalyzePath(fmt.Sprintf("/api/users/%d", appThreshold+1), "api") + assert.NoError(t, err) + expected := "/api/*" + assert.Equal(t, expected, result) +} + +// TestProcessSegments_WildcardWiringRegressions pins three correctness +// properties of processSegments that were broken in the zero-alloc rewrite +// of analyzer.go and caused node-agent component-test Test_27 +// (ApplicationProfileOpens) to fail at runtime. +// +// Each sub-case is small, self-contained, and would fail against the +// broken implementation — keeping them here means a future refactor of +// the zero-alloc hot path can't silently re-introduce any of these bugs. +func TestProcessSegments_WildcardWiringRegressions(t *testing.T) { + // Bug 1: threshold=1 configured for prefix P must wildcard P's + // CHILDREN, not P itself. The broken code used p[:i] (path + // including the current segment) for the threshold lookup, so + // inserting segment "app" saw threshold=1 from {Prefix:"/app"} + // and wildcarded at the root, producing "/*/*/*" instead of + // "/app/*". Fix: use p[:start] (path BEFORE the current segment) + // for the insertion-threshold lookup so the config governs the + // parent's children, not the current segment's insertion. + t.Run("threshold_1_wildcards_children_not_prefix", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + []dynamicpathdetector.CollapseConfig{{Prefix: "/app", Threshold: 1}}, + ) + got, err := analyzer.AnalyzePath("/app/service-a/config", "id") + assert.NoError(t, err) + assert.Equal(t, "/app/*", got, + "threshold-1 on /app must wildcard /app's children only, not collapse the /app prefix itself") + }) + + // Bug 2: once a segment has been emitted as `*`, the walk must + // stop appending subsequent path segments — otherwise every + // remaining segment re-follows the wildcard branch and emits + // "/*" again, producing "/a/*/*/*" where "/a/*" is correct. + // Fix: break out of the segment-walk loop as soon as + // currentNode.SegmentName == WildcardIdentifier. + t.Run("wildcard_absorbs_remaining_path_tail", func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + []dynamicpathdetector.CollapseConfig{{Prefix: "/short", Threshold: 1}}, + ) + got, err := analyzer.AnalyzePath("/short/a/b/c/d/e/f", "id") + assert.NoError(t, err) + assert.Equal(t, "/short/*", got, + "once the wildcard fires, the remaining 5 segments must not each emit an extra '/*'") + }) + + // Bug 3: when updateNodeStats collapses N children into a single + // ⋯ node, the ⋯ node's Count was left at 0. Subsequent walks + // descending into ⋯ then never re-triggered the collapse check, + // even when the absorbed grandchildren independently exceeded + // the threshold at the next level. Result: a grid like + // /a/{many}/{many}/leaf collapsed the first level but left + // grandchild literals visible in the output (e.g. + // "/a/⋯/sub0/⋯", "/a/⋯/sub1/⋯", ...). Fix: set + // dynamicChild.Count = len(dynamicChild.Children) after merge + // so the next level's updateNodeStats sees the true branching. + t.Run("multi_level_collapse_propagates_to_grandchildren", func(t *testing.T) { + threshold := 3 + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(threshold, nil) + // threshold+1 (=4) children × threshold+1 (=4) grandchildren = 16 paths. + // Both levels independently exceed threshold 3. + for i := 0; i <= threshold; i++ { + for j := 0; j <= threshold; j++ { + _, _ = analyzer.AnalyzePath( + fmt.Sprintf("/grid/level%d/sub%d/file", i, j), "id") } - }) - } + } + // Any path from the grid, re-analyzed, should be maximally + // collapsed — no grandchild literals should survive. + got, err := analyzer.AnalyzePath("/grid/level0/sub0/file", "id") + assert.NoError(t, err) + assert.NotContains(t, got, "sub0", + "grandchild literals must not survive in the collapsed output — got %q", got) + assert.NotContains(t, got, "level0", + "child literals must not survive in the collapsed output — got %q", got) + }) } diff --git a/pkg/registry/file/dynamicpathdetector/types.go b/pkg/registry/file/dynamicpathdetector/types.go index 02afc0c03..b581dee90 100644 --- a/pkg/registry/file/dynamicpathdetector/types.go +++ b/pkg/registry/file/dynamicpathdetector/types.go @@ -32,10 +32,10 @@ type CollapseConfig struct { // they want to tune for their workload. var DefaultCollapseConfigs = []CollapseConfig{ {Prefix: "/etc", Threshold: 100}, - {Prefix: "/etc/apache2", Threshold: 50}, + {Prefix: "/etc/apache2", Threshold: 50}, // tuned for the webapp standard test {Prefix: "/opt", Threshold: 50}, {Prefix: "/var/run", Threshold: 50}, - {Prefix: "/app", Threshold: 50}, + {Prefix: "/app", Threshold: 50}, // any variation under /app collapses immediately } // DefaultCollapseConfig is the global fallback used when no CollapseConfig From c66ca174081b50f9de958bfb18f3691f77d18302 Mon Sep 17 00:00:00 2001 From: entlein Date: Sat, 25 Apr 2026 18:22:42 +0200 Subject: [PATCH 10/16] addressing the rabbit Signed-off-by: entlein --- pkg/registry/file/cleanup.go | 17 ++- pkg/registry/file/cleanup_test.go | 70 ++++++++++++ .../file/dynamicpathdetector/analyzer.go | 19 ++-- .../tests/analyze_opens_test.go | 9 -- .../tests/coverage_test.go | 100 ++++++++++++++++++ .../dynamicpathdetector/tests/profile_test.go | 10 +- 6 files changed, 205 insertions(+), 20 deletions(-) diff --git a/pkg/registry/file/cleanup.go b/pkg/registry/file/cleanup.go index 36e0bc327..7e4ffc1e2 100644 --- a/pkg/registry/file/cleanup.go +++ b/pkg/registry/file/cleanup.go @@ -185,8 +185,8 @@ func (h *ResourcesCleanupHandler) cleanupNamespace(ctx context.Context, ns strin return nil } - // Skip user-managed resources (e.g., user-defined profiles) - if metadata.Labels[helpersv1.ManagedByMetadataKey] == helpersv1.ManagedByUserValue { + // Skip user-managed resources (e.g., user-defined profiles). + if isUserManaged(metadata) { return nil } @@ -217,6 +217,19 @@ func (h *ResourcesCleanupHandler) cleanupNamespace(ctx context.Context, ns strin return nil } +// isUserManaged reports whether the given resource metadata carries the +// "user-managed" marker. The marker lives on Annotations by codebase +// convention (see pkg/apis/softwarecomposition/networkpolicy/v2/ +// networkpolicy.go for the canonical read-site) — NOT on Labels. +// Reading from Labels would silently miss every user-managed resource +// and defeat the cleanup skip entirely. +func isUserManaged(metadata *metav1.ObjectMeta) bool { + if metadata == nil { + return false + } + return metadata.Annotations[helpersv1.ManagedByMetadataKey] == helpersv1.ManagedByUserValue +} + func or(funcs []TypeCleanupHandlerFunc, kind, path string, metadata *metav1.ObjectMeta, resourceMaps ResourceMaps) bool { for _, f := range funcs { if f(kind, path, metadata, resourceMaps) { diff --git a/pkg/registry/file/cleanup_test.go b/pkg/registry/file/cleanup_test.go index d25a5a69b..b2e0832ef 100644 --- a/pkg/registry/file/cleanup_test.go +++ b/pkg/registry/file/cleanup_test.go @@ -16,9 +16,11 @@ import ( mapset "github.com/deckarep/golang-set/v2" "github.com/goradd/maps" + helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "zombiezen.com/go/sqlite" ) @@ -175,3 +177,71 @@ func unzipFile(f *zip.File, destination string, appFs afero.Fs) error { } return nil } + +// TestIsUserManaged pins the invariant that user-managed resources are +// identified by an ANNOTATION (not a label). A previous version of the +// cleanup skip read the marker from metadata.Labels, which silently +// matched nothing (the marker is set as an annotation across the +// codebase) and allowed user-defined profiles to be garbage-collected. +// These cases would have passed with the Labels-reading implementation, +// so keeping them green guards against re-introducing that regression. +func TestIsUserManaged(t *testing.T) { + tests := []struct { + name string + metadata *metav1.ObjectMeta + want bool + }{ + { + name: "annotation_marker_present_true", + metadata: &metav1.ObjectMeta{ + Annotations: map[string]string{ + helpersv1.ManagedByMetadataKey: helpersv1.ManagedByUserValue, + }, + }, + want: true, + }, + { + name: "only_label_marker_not_annotation_false", + metadata: &metav1.ObjectMeta{ + Labels: map[string]string{ + helpersv1.ManagedByMetadataKey: helpersv1.ManagedByUserValue, + }, + }, + want: false, + }, + { + name: "annotation_marker_different_value_false", + metadata: &metav1.ObjectMeta{ + Annotations: map[string]string{ + helpersv1.ManagedByMetadataKey: "something-else", + }, + }, + want: false, + }, + { + name: "no_annotations_no_labels_false", + metadata: &metav1.ObjectMeta{}, + want: false, + }, + { + name: "nil_metadata_false", + metadata: nil, + want: false, + }, + { + name: "other_annotation_without_managed_by_false", + metadata: &metav1.ObjectMeta{ + Annotations: map[string]string{ + "unrelated/key": "value", + }, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, isUserManaged(tt.metadata)) + }) + } +} diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index d83fe756a..0caaf27d2 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -348,13 +348,18 @@ func compareSegments(dynamic, regular []string) bool { if len(dynamic) == 1 { return true } - // Otherwise try to match the rest of the dynamic pattern starting - // at every position in the regular tail; the wildcard greedily - // consumes 0+ segments as long as the remainder still matches. - nextDynamic := dynamic[1] - for i := range regular { - match := nextDynamic == DynamicIdentifier || regular[i] == nextDynamic - if match && compareSegments(dynamic[1:], regular[i:]) { + // Try to match the rest of the dynamic pattern starting at every + // position in the regular tail — including i == 0 (the wildcard + // consumed zero segments) and every later offset (wildcard + // consumed i segments). No optimistic peek at dynamic[1]: that + // optimization used to require regular[i] to literally equal + // dynamic[1], which is wrong whenever dynamic[1] is itself + // another `*` (consecutive wildcards like `/*/*` would never + // recurse and thus never match — user-authored profiles can + // contain literal /*/* patterns even though analyzer-generated + // ones are squashed by collapseAdjacentDynamicIdentifiers). + for i := 0; i <= len(regular); i++ { + if compareSegments(dynamic[1:], regular[i:]) { return true } } diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go index f52632ae5..d2a83b11f 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go @@ -736,15 +736,6 @@ func assertContainsOneOfPaths(t *testing.T, result []types.OpenCalls, alternativ assert.Fail(t, fmt.Sprintf("result does not contain any of %v, got: %v", alternatives, pathsFromResult(result))) } -func assertPathIsOneOf(t *testing.T, actual string, alternatives ...string) { - t.Helper() - for _, alt := range alternatives { - if actual == alt { - return - } - } - assert.Fail(t, fmt.Sprintf("path %q does not match any of %v", actual, alternatives)) -} func filterByPrefix(result []types.OpenCalls, prefix string) []types.OpenCalls { var filtered []types.OpenCalls diff --git a/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go b/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go index 7509b91e4..e6b6ab057 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go @@ -318,3 +318,103 @@ func TestProcessSegments_WildcardWiringRegressions(t *testing.T) { "child literals must not survive in the collapsed output — got %q", got) }) } + +// TestCompareDynamic_WildcardRegressions pins two cases that were +// silently wrong in the original compareSegments implementation: +// +// 1. Consecutive wildcards (`/*/*`): the inner loop required +// regular[i] == nextDynamic *before* recursing, which never fires +// when nextDynamic is itself `*` (no real path segment literally +// equals "*"). A user-authored profile with `/*/*` could never +// match any concrete path → R0002 fires where it shouldn't. +// +// 2. Zero-segment wildcard consumption (`/*/foo` matching `/foo`): +// the wildcard should be allowed to consume zero segments and +// then let the next static segment match. The old optimistic +// peek at dynamic[1] happened to get this case right when +// dynamic[1] was a literal matching regular[0], but was broken +// for cases where the next segment was itself a wildcard. +// +// Both are fixed by dropping the peek: unconditionally recurse at +// every i in [0, len(regular)] and let the recursion decide. +func TestCompareDynamic_WildcardRegressions(t *testing.T) { + tests := []struct { + name string + dynamic string + regular string + want bool + }{ + // Bug 1: consecutive wildcards. + { + name: "consecutive_wildcards_match_two_segments", + dynamic: "/*/*", + regular: "/foo/bar", + want: true, + }, + { + name: "consecutive_wildcards_match_one_segment", + dynamic: "/*/*", + regular: "/foo", + want: true, // second * consumes zero segments + }, + { + name: "triple_wildcards_match_any_depth", + dynamic: "/*/*/*", + regular: "/a/b/c", + want: true, + }, + // Bug 2: zero-segment wildcard consumption. + { + name: "wildcard_consumes_zero_then_literal_match", + dynamic: "/*/foo", + regular: "/foo", + want: true, + }, + { + name: "wildcard_consumes_zero_between_literals", + dynamic: "/a/*/b", + regular: "/a/b", + want: true, + }, + { + name: "wildcard_consumes_many_then_literal_match", + dynamic: "/*/foo", + regular: "/a/b/c/foo", + want: true, + }, + // Sanity: non-regressions — cases that must still return false. + { + name: "literal_suffix_mismatch_still_false", + dynamic: "/*/foo", + regular: "/a/b/baz", + want: false, + }, + { + name: "literal_prefix_mismatch_still_false", + dynamic: "/api/*", + regular: "/web/a", + want: false, + }, + { + name: "empty_tail_with_unsatisfied_literal_false", + dynamic: "/*/x", + regular: "/", + want: false, + }, + // Interaction with DynamicIdentifier (⋯, single segment). + { + name: "mixed_wildcard_and_dynamic_match", + dynamic: "/⋯/*", + regular: "/foo/bar/baz", + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := dynamicpathdetector.CompareDynamic(tt.dynamic, tt.regular) + assert.Equal(t, tt.want, got, + "CompareDynamic(%q, %q) = %v, want %v", tt.dynamic, tt.regular, got, tt.want) + }) + } +} diff --git a/pkg/registry/file/dynamicpathdetector/tests/profile_test.go b/pkg/registry/file/dynamicpathdetector/tests/profile_test.go index 7921abc17..d9db951c8 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/profile_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/profile_test.go @@ -85,11 +85,17 @@ func TestProfileAnalyzePath(t *testing.T) { } } + // Read memstats immediately after the measured loop, BEFORE stopping + // the CPU profile and closing the output file. Both of those do + // non-trivial internal allocations (buffer flush, file finalization) + // that would otherwise land in `after.TotalAlloc` / `after.Mallocs` + // and inflate the reported per-call numbers — material noise for a + // zero-alloc target. + runtime.ReadMemStats(&after) + pprof.StopCPUProfile() cpuFile.Close() - runtime.ReadMemStats(&after) - // Heap profile (alloc_space). memPath := filepath.Join(*profileOutDir, "mem.out") memFile, err := os.Create(memPath) From 55caf43d31bf2d9b08864b28af24355c9e4c256b Mon Sep 17 00:00:00 2001 From: entlein Date: Sat, 25 Apr 2026 19:13:57 +0200 Subject: [PATCH 11/16] addressing the rabbit round2 Signed-off-by: entlein --- .../file/dynamicpathdetector/analyzer.go | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index 0caaf27d2..9a0b517bc 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -48,12 +48,18 @@ func NewPathAnalyzerWithConfigs(defaultThreshold int, configs []CollapseConfig) // path prefix, picking the longest matching CollapseConfig or falling back // to the analyzer's default. Loop is O(len(configs)) and configs is small // (five entries in practice); no allocations. +// +// Tiebreak on equal-length prefixes: FIRST entry wins (strict `>`). This +// must mirror FindConfigForPath so callers using FindConfigForPath to +// introspect the active config see the same result the analyzer actually +// uses at walk time. Mismatched comparators (`>=` vs `>`) on duplicate +// prefixes are a silent footgun for anyone who doesn't dedupe configs. func (ua *PathAnalyzer) effectiveThreshold(pathPrefix string) int { - bestLen := 0 + bestLen := -1 best := ua.threshold for i := range ua.configs { c := &ua.configs[i] - if len(c.Prefix) >= bestLen && hasPrefixAtBoundary(pathPrefix, c.Prefix) { + if len(c.Prefix) > bestLen && hasPrefixAtBoundary(pathPrefix, c.Prefix) { bestLen = len(c.Prefix) best = c.Threshold } @@ -64,6 +70,13 @@ func (ua *PathAnalyzer) effectiveThreshold(pathPrefix string) int { // hasPrefixAtBoundary is like strings.HasPrefix but only matches if the // prefix ends at a path boundary (either pathPrefix == prefix, or the next // rune in pathPrefix is '/'). Prevents "/etc" matching "/etcd". +// +// Special case: prefix == "/" — the trailing '/' already implies a boundary, +// and any absolute path begins with '/'. Without this case, a user-supplied +// `{Prefix:"/", Threshold:X}` config would silently never match for any +// path past the root (e.g. "/foo" since pathPrefix[1] == 'f', not '/'), +// which means an explicit catch-all override could not actually override +// the analyzer's default threshold. func hasPrefixAtBoundary(pathPrefix, prefix string) bool { if len(pathPrefix) < len(prefix) { return false @@ -74,6 +87,9 @@ func hasPrefixAtBoundary(pathPrefix, prefix string) bool { if len(pathPrefix) == len(prefix) { return true } + if prefix == "/" { + return true + } return pathPrefix[len(prefix)] == '/' } @@ -243,10 +259,15 @@ func (ua *PathAnalyzer) handleDynamicSegment(node *SegmentNode) *SegmentNode { } // createWildcardNode replaces all of node's existing children with a single -// WildcardIdentifier (*) child, absorbing the existing subtree counts into it. -// Used for the threshold-1 short-circuit: once a prefix is configured to keep -// at most one unique child, any second unique value collapses the whole -// subtree to *. +// WildcardIdentifier (*) child, absorbing any existing subtree counts into it. +// Used for the threshold-1 short-circuit: a CollapseConfig with Threshold == 1 +// means "any new child segment under this prefix is noise", so the FIRST new +// segment immediately wildcards (there are typically no children to absorb on +// the first call; if the analyzer has previously seen children there, they +// get folded into the wildcard subtree at this point). The semantics are +// pinned by TestAnalyzeOpensThreshold1ImmediateWildcard / +// "single path - no collapse yet" which expects /instant/only-child/data +// to collapse to /instant/* after a single insert. func (ua *PathAnalyzer) createWildcardNode(node *SegmentNode) *SegmentNode { wildcard := &SegmentNode{ SegmentName: WildcardIdentifier, From cc2a2ecf76397b98d0a19da7d3cdbfa523834d75 Mon Sep 17 00:00:00 2001 From: Entlein Date: Wed, 29 Apr 2026 07:29:03 +0200 Subject: [PATCH 12/16] Addressing Matthias review and adding several tests. Also benchmarking it: Local benchmark (AnalyzeEndpoints, realistic shape: 1 wildcard out of N, 3 runs each, median ns/op) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ┌──────┬────────────────┬────────────┬─────────┬───────────────────────┐ │ N │ Pre (cdbf491b) │ Post (fix) │ Δ ns/op │ allocs/op pre→post │ ├──────┼────────────────┼────────────┼─────────┼───────────────────────┤ │ 8 │ 8 832 │ 9 083 │ +2.8% │ 125 → 125 │ ├──────┼────────────────┼────────────┼─────────┼───────────────────────┤ │ 64 │ 78 050 │ 85 192 │ +9.2% │ 980 → 980 │ ├──────┼────────────────┼────────────┼─────────┼───────────────────────┤ │ 256 │ 291 932 │ 347 453 │ +19.0% │ 3869 → 3869 │ ├──────┼────────────────┼────────────┼─────────┼───────────────────────┤ │ 1024 │ 1 503 729 │ 1 535 156 │ +2.1% │ 29 674 → 27 634 │ └──────┴────────────────┴────────────┴─────────┴───────────────────────┘ Signed-off-by: entlein --- .../dynamicpathdetector/analyze_endpoints.go | 95 +++++++---- .../tests/analyze_endpoints_test.go | 147 ++++++++++++++++-- 2 files changed, 198 insertions(+), 44 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go index d620e3083..a0eba25a4 100644 --- a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go +++ b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go @@ -14,42 +14,23 @@ func isWildcardPort(port string) bool { return port == "0" } -func rewritePort(endpoint, wildcardPort string) string { - if wildcardPort == "" { - return endpoint - } - port, pathPart := splitEndpointPortAndPath(endpoint) - if !isWildcardPort(port) { - return ":" + wildcardPort + pathPart - } - return endpoint -} - func AnalyzeEndpoints(endpoints *[]types.HTTPEndpoint, analyzer *PathAnalyzer) []types.HTTPEndpoint { if len(*endpoints) == 0 { return nil } - // Detect wildcard port in input (port 0 means any port) - wildcardPort := "" - for _, ep := range *endpoints { - port, _ := splitEndpointPortAndPath(ep.Endpoint) - if isWildcardPort(port) { - wildcardPort = port - break - } - } - - // First pass: build tree, redirecting to wildcard port if needed + // First pass: build the analyzer trie from each endpoint's true (port, + // path) tuple. Each port keys a separate sub-tree, so :0/foo and + // :443/foo are analyzed independently — :443/foo is NOT rewritten to + // :0/foo just because some unrelated endpoint also uses :0. for _, endpoint := range *endpoints { - _, _ = AnalyzeURL(rewritePort(endpoint.Endpoint, wildcardPort), analyzer) + _, _ = AnalyzeURL(endpoint.Endpoint, analyzer) } - // Second pass: process endpoints + // Second pass: process endpoints with their original ports. var newEndpoints []*types.HTTPEndpoint for _, endpoint := range *endpoints { ep := endpoint - ep.Endpoint = rewritePort(ep.Endpoint, wildcardPort) processedEndpoint, err := ProcessEndpoint(&ep, analyzer, newEndpoints) if processedEndpoint == nil && err == nil || err != nil { continue @@ -57,6 +38,8 @@ func AnalyzeEndpoints(endpoints *[]types.HTTPEndpoint, analyzer *PathAnalyzer) [ newEndpoints = append(newEndpoints, processedEndpoint) } + // Cross-port folding happens here: only same-(path, direction) siblings + // of an explicit :0 wildcard get absorbed into it. newEndpoints = MergeDuplicateEndpoints(newEndpoints) return convertPointerToValueSlice(newEndpoints) @@ -125,6 +108,19 @@ func splitEndpointPortAndPath(endpoint string) (string, string) { return s[:idx], s[idx:] } +// MergeDuplicateEndpoints folds duplicates and merges same-path specific-port +// endpoints into a wildcard-port (:0) sibling. The folding is symmetric: +// +// - If a specific-port endpoint is encountered AFTER its :0 sibling, the +// specific-port methods/headers are merged INTO the wildcard entry. +// - If a specific-port endpoint is encountered BEFORE its :0 sibling, it +// is initially recorded; when the wildcard arrives we sweep `seen` for +// same-(path, direction) specific-port siblings, fold them into the +// wildcard, and remove them from the output. +// +// This contract was tightened on the back of upstream review on +// kubescape/storage#316 — a single :0 entry must NOT cause unrelated +// concrete-port endpoints to be wildcarded; only same-path siblings fold. func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpoint { seen := make(map[string]*types.HTTPEndpoint) var newEndpoints []*types.HTTPEndpoint @@ -137,15 +133,38 @@ func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpo continue } - // Check if a wildcard port variant already exists (port 0 means any port) port, pathPart := splitEndpointPortAndPath(endpoint.Endpoint) - if !isWildcardPort(port) { - wildcardKey := fmt.Sprintf(":%s%s|%s", "0", pathPart, endpoint.Direction) - if existing, found := seen[wildcardKey]; found { - existing.Methods = MergeStrings(existing.Methods, endpoint.Methods) - mergeHeaders(existing, endpoint) - continue + + if isWildcardPort(port) { + // Wildcard arriving after specific-port siblings — sweep `seen` + // for any same-(path, direction) specific-port entries already + // recorded, fold them into the wildcard, then drop them from + // the output slice. + absorbed := false + for k, e := range seen { + ePort, ePath := splitEndpointPortAndPath(e.Endpoint) + if isWildcardPort(ePort) || ePath != pathPart || e.Direction != endpoint.Direction { + continue + } + endpoint.Methods = MergeStrings(endpoint.Methods, e.Methods) + mergeHeaders(endpoint, e) + delete(seen, k) + newEndpoints = removeEndpoint(newEndpoints, e) + absorbed = true } + seen[key] = endpoint + newEndpoints = append(newEndpoints, endpoint) + _ = absorbed + continue + } + + // Specific port: if a wildcard sibling for the same (path, direction) + // is already in `seen`, fold this entry into it. + wildcardKey := fmt.Sprintf(":0%s|%s", pathPart, endpoint.Direction) + if existing, found := seen[wildcardKey]; found { + existing.Methods = MergeStrings(existing.Methods, endpoint.Methods) + mergeHeaders(existing, endpoint) + continue } seen[key] = endpoint @@ -155,6 +174,18 @@ func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpo return newEndpoints } +// removeEndpoint returns a new slice with the first occurrence of target +// removed (compared by pointer). Used by MergeDuplicateEndpoints when a +// previously-recorded specific-port entry is absorbed into a later wildcard. +func removeEndpoint(s []*types.HTTPEndpoint, target *types.HTTPEndpoint) []*types.HTTPEndpoint { + for i, e := range s { + if e == target { + return append(s[:i], s[i+1:]...) + } + } + return s +} + func getEndpointKey(endpoint *types.HTTPEndpoint) string { return fmt.Sprintf("%s|%s", endpoint.Endpoint, endpoint.Direction) } diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go index 93172a1aa..ec7fe58a5 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go @@ -73,6 +73,12 @@ func TestAnalyzeEndpoints(t *testing.T) { }, }, { + // A single :0 (wildcard-port) entry MUST NOT contaminate + // unrelated concrete-port endpoints. Only same-(path, direction) + // siblings of an explicit :0 entry are folded into it; here the + // :80 and :8770 paths are distinct from the :0 path, so each + // endpoint stays on its own port. Regression test for the bug + // flagged in upstream review on kubescape/storage#316. name: "Test with 0 port", input: []types.HTTPEndpoint{ { @@ -88,10 +94,23 @@ func TestAnalyzeEndpoints(t *testing.T) { Methods: []string{"POST"}, }, }, + // NOTE: the analyzer trie persists across subtests in this + // table (analyzer is created outside the t.Run loop), so + // /users/\u22ef/posts/\u22ef has already been observed enough times + // for the :80 entry's trailing segment to be dynamicized; + // the :8770 path is fresh in the trie so it stays concrete. expected: []types.HTTPEndpoint{ { - Endpoint: ":0/users/\u22ef/posts/\u22ef", - Methods: []string{"GET", "POST"}, + Endpoint: ":0/users/123/posts/\u22ef", + Methods: []string{"GET"}, + }, + { + Endpoint: ":80/users/\u22ef/posts/\u22ef", + Methods: []string{"POST"}, + }, + { + Endpoint: ":8770/users/blub/posts/101", + Methods: []string{"POST"}, }, }, }, @@ -238,17 +257,21 @@ func TestAnalyzeEndpointsWithInvalidURL(t *testing.T) { assert.Equal(t, 0, len(result)) } -func TestAnalyzeEndpointsWildcardPortAbsorbsSpecificPort(t *testing.T) { +// TestAnalyzeEndpoints_WildcardDoesNotContaminateUnrelatedPaths pins the bug +// flagged by upstream review on kubescape/storage#316: a single wildcard-port +// endpoint must NOT cause unrelated specific-port endpoints (different path) +// to be rewritten to :0. Only same-path siblings should fold into the wildcard. +func TestAnalyzeEndpoints_WildcardDoesNotContaminateUnrelatedPaths(t *testing.T) { analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) input := []types.HTTPEndpoint{ { - Endpoint: ":0/users/123", + Endpoint: ":0/health", Methods: []string{"GET"}, Direction: "outbound", }, { - Endpoint: ":80/users/456", + Endpoint: ":443/login", Methods: []string{"POST"}, Direction: "outbound", }, @@ -256,34 +279,98 @@ func TestAnalyzeEndpointsWildcardPortAbsorbsSpecificPort(t *testing.T) { result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + endpoints := make(map[string]bool, len(result)) for _, ep := range result { - port := ep.Endpoint[:len(":0")] - assert.Equal(t, ":0", port, "endpoint %s should have wildcard port", ep.Endpoint) + endpoints[ep.Endpoint] = true } + assert.Equal(t, 2, len(result), "unrelated paths must not be merged: got %v", endpoints) + assert.True(t, endpoints[":0/health"], "wildcard endpoint :0/health must be preserved") + assert.True(t, endpoints[":443/login"], "specific-port endpoint :443/login must keep its port (no wildcard sibling on the same path)") } -func TestAnalyzeEndpointsWildcardPortAfterSpecificPorts(t *testing.T) { +// TestAnalyzeEndpoints_SamePathSpecificFirstThenWildcard exercises the +// reverse-order case: the specific-port endpoint comes first in the slice, +// then a wildcard sibling on the SAME path. The two must merge into the +// wildcard. Without symmetric merging in MergeDuplicateEndpoints, the +// specific endpoint sticks around alongside the wildcard. +func TestAnalyzeEndpoints_SamePathSpecificFirstThenWildcard(t *testing.T) { analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) input := []types.HTTPEndpoint{ { - Endpoint: ":80/api/data", + Endpoint: ":443/login", + Methods: []string{"POST"}, + Direction: "outbound", + }, + { + Endpoint: ":0/login", + Methods: []string{"GET"}, + Direction: "outbound", + }, + } + + result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + + assert.Equal(t, 1, len(result), "specific-port sibling must fold into the wildcard regardless of order") + assert.Equal(t, ":0/login", result[0].Endpoint) + assert.ElementsMatch(t, []string{"GET", "POST"}, result[0].Methods, "methods from both ports must be merged") +} + +// TestAnalyzeEndpoints_NoWildcardKeepsSpecificPort asserts that without ANY +// wildcard sibling, specific-port endpoints stay specific. A regression here +// would mean the analyzer is wildcarding too aggressively. +func TestAnalyzeEndpoints_NoWildcardKeepsSpecificPort(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) + + input := []types.HTTPEndpoint{ + { + Endpoint: ":443/login", + Methods: []string{"POST"}, + Direction: "outbound", + }, + } + + result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + + assert.Equal(t, 1, len(result)) + assert.Equal(t, ":443/login", result[0].Endpoint, "no wildcard sibling => port must be preserved") +} + +// TestAnalyzeEndpoints_OnlyMatchingPathsFoldIntoWildcard combines the +// wildcard-contamination case with the same-path-merge case to verify both +// invariants hold simultaneously. :0/api absorbs :80/api (same path); but +// :443/admin (different path, no wildcard sibling) keeps its port. +func TestAnalyzeEndpoints_OnlyMatchingPathsFoldIntoWildcard(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) + + input := []types.HTTPEndpoint{ + { + Endpoint: ":0/api", Methods: []string{"GET"}, Direction: "outbound", }, { - Endpoint: ":0/api/info", + Endpoint: ":80/api", Methods: []string{"POST"}, Direction: "outbound", }, + { + Endpoint: ":443/admin", + Methods: []string{"DELETE"}, + Direction: "outbound", + }, } result := dynamicpathdetector.AnalyzeEndpoints(&input, analyzer) + endpoints := make(map[string][]string, len(result)) for _, ep := range result { - port := ep.Endpoint[:len(":0")] - assert.Equal(t, ":0", port, "endpoint %s should have wildcard port", ep.Endpoint) + endpoints[ep.Endpoint] = ep.Methods } + + assert.Equal(t, 2, len(result), "expected :0/api and :443/admin, got %v", endpoints) + assert.ElementsMatch(t, []string{"GET", "POST"}, endpoints[":0/api"], "/api siblings must merge into wildcard") + assert.ElementsMatch(t, []string{"DELETE"}, endpoints[":443/admin"], ":443/admin must NOT be wildcarded — no wildcard sibling on /admin") } func TestAnalyzeEndpointsMultiplePortsMergeIntoWildcard(t *testing.T) { @@ -332,3 +419,39 @@ func TestMergeDuplicateEndpointsWildcardPort(t *testing.T) { assert.Equal(t, ":0/api/data", result[0].Endpoint) assert.Equal(t, []string{"GET", "POST"}, result[0].Methods) } + +// TestMergeDuplicateEndpoints_SpecificFirstThenWildcard pins the reverse +// order — specific-port endpoint encountered first, wildcard sibling second. +// Without symmetric merging in MergeDuplicateEndpoints both entries survive, +// which CodeRabbit flagged on PR #316. Locking the contract here. +func TestMergeDuplicateEndpoints_SpecificFirstThenWildcard(t *testing.T) { + specificEP := &types.HTTPEndpoint{ + Endpoint: ":80/api/data", + Methods: []string{"POST"}, + Direction: "outbound", + } + wildcardEP := &types.HTTPEndpoint{ + Endpoint: ":0/api/data", + Methods: []string{"GET"}, + Direction: "outbound", + } + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{specificEP, wildcardEP}) + + assert.Equal(t, 1, len(result), "wildcard sibling must absorb the earlier specific-port entry") + assert.Equal(t, ":0/api/data", result[0].Endpoint) + assert.ElementsMatch(t, []string{"GET", "POST"}, result[0].Methods) +} + +// TestMergeDuplicateEndpoints_NoWildcardKeepsAllSpecificPorts asserts that +// without a wildcard sibling, distinct (port,path) pairs all survive. +// A regression here would mean the merge is collapsing too aggressively. +func TestMergeDuplicateEndpoints_NoWildcardKeepsAllSpecificPorts(t *testing.T) { + a := &types.HTTPEndpoint{Endpoint: ":80/api/data", Methods: []string{"GET"}, Direction: "outbound"} + b := &types.HTTPEndpoint{Endpoint: ":443/api/data", Methods: []string{"POST"}, Direction: "outbound"} + c := &types.HTTPEndpoint{Endpoint: ":8080/api/data", Methods: []string{"PUT"}, Direction: "outbound"} + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{a, b, c}) + + assert.Equal(t, 3, len(result), "no wildcard sibling => all specific-port endpoints must be kept") +} From 9afcfe7fdda0b63696866783e2eb539dfb0ffebf Mon Sep 17 00:00:00 2001 From: entlein Date: Wed, 29 Apr 2026 10:34:09 +0200 Subject: [PATCH 13/16] moving the constructor inside the test loop -- e2e tests are running Signed-off-by: entlein --- .../tests/analyze_endpoints_test.go | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go index ec7fe58a5..3f7700839 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go @@ -12,8 +12,6 @@ import ( ) func TestAnalyzeEndpoints(t *testing.T) { - analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) - tests := []struct { name string input []types.HTTPEndpoint @@ -94,18 +92,13 @@ func TestAnalyzeEndpoints(t *testing.T) { Methods: []string{"POST"}, }, }, - // NOTE: the analyzer trie persists across subtests in this - // table (analyzer is created outside the t.Run loop), so - // /users/\u22ef/posts/\u22ef has already been observed enough times - // for the :80 entry's trailing segment to be dynamicized; - // the :8770 path is fresh in the trie so it stays concrete. expected: []types.HTTPEndpoint{ { Endpoint: ":0/users/123/posts/\u22ef", Methods: []string{"GET"}, }, { - Endpoint: ":80/users/\u22ef/posts/\u22ef", + Endpoint: ":80/users/\u22ef/posts/101", Methods: []string{"POST"}, }, { @@ -175,6 +168,7 @@ func TestAnalyzeEndpoints(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.EndpointDynamicThreshold, nil) result := dynamicpathdetector.AnalyzeEndpoints(&tt.input, analyzer) ja := jsonassert.New(t) for i := range result { From dd028cf30f706c480d95f526d66618c240371438 Mon Sep 17 00:00:00 2001 From: entlein Date: Wed, 29 Apr 2026 15:16:45 +0200 Subject: [PATCH 14/16] adding INternal vs EXternal to the hash that make the collapse unique Signed-off-by: entlein --- .../dynamicpathdetector/analyze_endpoints.go | 38 +++--- .../tests/analyze_endpoints_test.go | 116 ++++++++++++++++++ 2 files changed, 139 insertions(+), 15 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go index a0eba25a4..721406ecf 100644 --- a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go +++ b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go @@ -109,18 +109,22 @@ func splitEndpointPortAndPath(endpoint string) (string, string) { } // MergeDuplicateEndpoints folds duplicates and merges same-path specific-port -// endpoints into a wildcard-port (:0) sibling. The folding is symmetric: +// endpoints into a wildcard-port (:0) sibling. Folding is symmetric and is +// keyed on the same triple HTTPEndpoint.Equal compares — (Endpoint, +// Direction, Internal). An Internal=false endpoint will therefore NOT merge +// with an Internal=true sibling even if their path and direction match. // // - If a specific-port endpoint is encountered AFTER its :0 sibling, the // specific-port methods/headers are merged INTO the wildcard entry. // - If a specific-port endpoint is encountered BEFORE its :0 sibling, it // is initially recorded; when the wildcard arrives we sweep `seen` for -// same-(path, direction) specific-port siblings, fold them into the -// wildcard, and remove them from the output. +// same-(path, direction, Internal) specific-port siblings, fold them +// into the wildcard, and remove them from the output. // // This contract was tightened on the back of upstream review on // kubescape/storage#316 — a single :0 entry must NOT cause unrelated -// concrete-port endpoints to be wildcarded; only same-path siblings fold. +// concrete-port endpoints to be wildcarded; only same-path same-Internal +// siblings fold. func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpoint { seen := make(map[string]*types.HTTPEndpoint) var newEndpoints []*types.HTTPEndpoint @@ -137,30 +141,30 @@ func MergeDuplicateEndpoints(endpoints []*types.HTTPEndpoint) []*types.HTTPEndpo if isWildcardPort(port) { // Wildcard arriving after specific-port siblings — sweep `seen` - // for any same-(path, direction) specific-port entries already - // recorded, fold them into the wildcard, then drop them from - // the output slice. - absorbed := false + // for any same-(path, direction, Internal) specific-port entries + // already recorded, fold them into the wildcard, then drop them + // from the output slice. for k, e := range seen { ePort, ePath := splitEndpointPortAndPath(e.Endpoint) - if isWildcardPort(ePort) || ePath != pathPart || e.Direction != endpoint.Direction { + if isWildcardPort(ePort) || ePath != pathPart || + e.Direction != endpoint.Direction || e.Internal != endpoint.Internal { continue } endpoint.Methods = MergeStrings(endpoint.Methods, e.Methods) mergeHeaders(endpoint, e) delete(seen, k) newEndpoints = removeEndpoint(newEndpoints, e) - absorbed = true } seen[key] = endpoint newEndpoints = append(newEndpoints, endpoint) - _ = absorbed continue } - // Specific port: if a wildcard sibling for the same (path, direction) - // is already in `seen`, fold this entry into it. - wildcardKey := fmt.Sprintf(":0%s|%s", pathPart, endpoint.Direction) + // Specific port: if a wildcard sibling for the same + // (path, direction, Internal) is already in `seen`, fold this entry + // into it. The wildcardKey shape MUST match getEndpointKey exactly so + // the lookup hits the same map slot the wildcard was inserted under. + wildcardKey := fmt.Sprintf(":0%s|%s|%t", pathPart, endpoint.Direction, endpoint.Internal) if existing, found := seen[wildcardKey]; found { existing.Methods = MergeStrings(existing.Methods, endpoint.Methods) mergeHeaders(existing, endpoint) @@ -186,8 +190,12 @@ func removeEndpoint(s []*types.HTTPEndpoint, target *types.HTTPEndpoint) []*type return s } +// getEndpointKey returns a key that uniquely identifies an HTTPEndpoint by +// the same fields HTTPEndpoint.Equal compares: Endpoint, Direction, Internal. +// Keep this in sync with the wildcardKey shape constructed in +// MergeDuplicateEndpoints — the two MUST hash identical entries identically. func getEndpointKey(endpoint *types.HTTPEndpoint) string { - return fmt.Sprintf("%s|%s", endpoint.Endpoint, endpoint.Direction) + return fmt.Sprintf("%s|%s|%t", endpoint.Endpoint, endpoint.Direction, endpoint.Internal) } func mergeHeaders(existing, new *types.HTTPEndpoint) { diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go index 3f7700839..460d4cd12 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_endpoints_test.go @@ -449,3 +449,119 @@ func TestMergeDuplicateEndpoints_NoWildcardKeepsAllSpecificPorts(t *testing.T) { assert.Equal(t, 3, len(result), "no wildcard sibling => all specific-port endpoints must be kept") } + +// --------------------------------------------------------------------------- +// Internal-field isolation tests. +// +// `HTTPEndpoint.Equal` distinguishes endpoints by (Endpoint, Direction, +// Internal). The merge key in `MergeDuplicateEndpoints` and the wildcard +// sweep must therefore also distinguish Internal — otherwise an +// internally-originating endpoint can absorb an externally-originating one +// (or vice versa) just because they share path + direction. +// +// Flagged by upstream review on kubescape/storage#316 (matthyx). +// --------------------------------------------------------------------------- + +// TestMergeDuplicateEndpoints_InternalFieldDistinguishesDuplicates asserts +// that two endpoints differing ONLY in Internal are NOT collapsed by the +// duplicate-key check at the top of the merge loop. +func TestMergeDuplicateEndpoints_InternalFieldDistinguishesDuplicates(t *testing.T) { + external := &types.HTTPEndpoint{ + Endpoint: ":443/login", + Methods: []string{"POST"}, + Direction: "outbound", + Internal: false, + } + internal := &types.HTTPEndpoint{ + Endpoint: ":443/login", + Methods: []string{"GET"}, + Direction: "outbound", + Internal: true, + } + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{external, internal}) + + assert.Equal(t, 2, len(result), + "endpoints with different Internal must NOT merge (HTTPEndpoint.Equal distinguishes Internal)") + // Each output must keep its own Internal value and methods. + for _, ep := range result { + switch ep.Internal { + case false: + assert.Equal(t, []string{"POST"}, ep.Methods, "external endpoint methods must not be polluted") + case true: + assert.Equal(t, []string{"GET"}, ep.Methods, "internal endpoint methods must not be polluted") + } + } +} + +// TestMergeDuplicateEndpoints_InternalFieldGuardsWildcardAbsorbingPrior +// pins the wildcard-after-specific path: an Internal=false :0 wildcard must +// NOT sweep up a previously-recorded Internal=true specific-port sibling. +func TestMergeDuplicateEndpoints_InternalFieldGuardsWildcardAbsorbingPrior(t *testing.T) { + specificInternal := &types.HTTPEndpoint{ + Endpoint: ":443/login", + Methods: []string{"POST"}, + Direction: "outbound", + Internal: true, + } + wildcardExternal := &types.HTTPEndpoint{ + Endpoint: ":0/login", + Methods: []string{"GET"}, + Direction: "outbound", + Internal: false, + } + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{specificInternal, wildcardExternal}) + + assert.Equal(t, 2, len(result), + "wildcard with Internal=false must NOT absorb a specific-port sibling with Internal=true") +} + +// TestMergeDuplicateEndpoints_InternalFieldGuardsSpecificFoldingIntoWildcard +// pins the wildcard-first path: a previously-recorded Internal=true wildcard +// must NOT absorb a later Internal=false specific-port sibling. +func TestMergeDuplicateEndpoints_InternalFieldGuardsSpecificFoldingIntoWildcard(t *testing.T) { + wildcardInternal := &types.HTTPEndpoint{ + Endpoint: ":0/login", + Methods: []string{"GET"}, + Direction: "outbound", + Internal: true, + } + specificExternal := &types.HTTPEndpoint{ + Endpoint: ":443/login", + Methods: []string{"POST"}, + Direction: "outbound", + Internal: false, + } + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{wildcardInternal, specificExternal}) + + assert.Equal(t, 2, len(result), + "specific-port with Internal=false must NOT fold into a wildcard sibling with Internal=true") +} + +// TestMergeDuplicateEndpoints_InternalFieldMatching_StillFolds is the +// positive sanity check: when Internal DOES match, the existing +// path+direction merge contract still holds. A regression here would mean +// the Internal guard accidentally blocks legitimate folding. +func TestMergeDuplicateEndpoints_InternalFieldMatching_StillFolds(t *testing.T) { + wildcard := &types.HTTPEndpoint{ + Endpoint: ":0/login", + Methods: []string{"GET"}, + Direction: "outbound", + Internal: true, + } + specific := &types.HTTPEndpoint{ + Endpoint: ":443/login", + Methods: []string{"POST"}, + Direction: "outbound", + Internal: true, + } + + result := dynamicpathdetector.MergeDuplicateEndpoints([]*types.HTTPEndpoint{wildcard, specific}) + + assert.Equal(t, 1, len(result), "matching Internal => specific-port sibling still folds into wildcard") + assert.Equal(t, ":0/login", result[0].Endpoint) + assert.True(t, result[0].Internal, "merged endpoint must preserve Internal=true") + assert.ElementsMatch(t, []string{"GET", "POST"}, result[0].Methods) +} From bff572a0f9a96e32ac943eff2fd9119d6b970cba Mon Sep 17 00:00:00 2001 From: entlein Date: Fri, 1 May 2026 18:43:34 +0200 Subject: [PATCH 15/16] addressing the tests, the now more defensive guards for empty, more explicit tests and behavior for empty versus wildcard, restores the benchmark test and addresses the nitpick comments - does not include the performance issue re the zero-alloc Signed-off-by: entlein --- .../file/applicationprofile_processor.go | 2 +- .../file/applicationprofile_processor_test.go | 12 +- .../file/containerprofile_processor.go | 2 +- .../dynamicpathdetector/analyze_endpoints.go | 29 ++- .../analyze_endpoints_internal_test.go | 55 +++++ .../file/dynamicpathdetector/analyzer.go | 81 +++++-- .../tests/benchmark_test.go | 46 +++- .../tests/coverage_test.go | 227 ++++++++++++++++++ .../dynamicpathdetector/tests/profile_test.go | 35 ++- .../file/dynamicpathdetector/types.go | 16 +- 10 files changed, 454 insertions(+), 51 deletions(-) create mode 100644 pkg/registry/file/dynamicpathdetector/analyze_endpoints_internal_test.go diff --git a/pkg/registry/file/applicationprofile_processor.go b/pkg/registry/file/applicationprofile_processor.go index e823fe4a9..ee0509eb6 100644 --- a/pkg/registry/file/applicationprofile_processor.go +++ b/pkg/registry/file/applicationprofile_processor.go @@ -107,7 +107,7 @@ func (a *ApplicationProfileProcessor) SetStorage(containerProfileStorage Contain } func deflateApplicationProfileContainer(container softwarecomposition.ApplicationProfileContainer, sbomSet mapset.Set[string]) softwarecomposition.ApplicationProfileContainer { - opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, dynamicpathdetector.DefaultCollapseConfigs), sbomSet) + opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, dynamicpathdetector.DefaultCollapseConfigs()), sbomSet) if err != nil { logger.L().Debug("falling back to DeflateStringer for opens", loggerhelpers.Error(err)) opens = DeflateStringer(container.Opens) diff --git a/pkg/registry/file/applicationprofile_processor_test.go b/pkg/registry/file/applicationprofile_processor_test.go index e09ed0e15..62574f116 100644 --- a/pkg/registry/file/applicationprofile_processor_test.go +++ b/pkg/registry/file/applicationprofile_processor_test.go @@ -344,8 +344,16 @@ func TestDeflateApplicationProfileContainer_MixedPathsCollapse(t *testing.T) { }) } - // /etc uses the /etc config threshold from DefaultCollapseConfigs (100) - etcThreshold := 100 + // /etc uses the /etc config threshold from DefaultCollapseConfigs. + // Derive from the live config so this test stays in sync if the + // production threshold for /etc ever changes — hardcoding 100 here + // previously meant the test would silently pass even when + // DefaultCollapseConfigs drifted (CodeRabbit C5). + etcAnalyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + dynamicpathdetector.DefaultCollapseConfigs(), + ) + etcThreshold := etcAnalyzer.FindConfigForPath("/etc/file").Threshold for i := 0; i < etcThreshold+1; i++ { opens = append(opens, softwarecomposition.OpenCalls{ Path: fmt.Sprintf("/etc/conf%d.cfg", i), diff --git a/pkg/registry/file/containerprofile_processor.go b/pkg/registry/file/containerprofile_processor.go index b8df5897a..29077b338 100644 --- a/pkg/registry/file/containerprofile_processor.go +++ b/pkg/registry/file/containerprofile_processor.go @@ -743,7 +743,7 @@ func (a *ContainerProfileProcessor) getAggregatedData(ctx context.Context, key s } func DeflateContainerProfileSpec(container softwarecomposition.ContainerProfileSpec, sbomSet mapset.Set[string]) softwarecomposition.ContainerProfileSpec { - opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, dynamicpathdetector.DefaultCollapseConfigs), sbomSet) + opens, err := dynamicpathdetector.AnalyzeOpens(container.Opens, dynamicpathdetector.NewPathAnalyzerWithConfigs(dynamicpathdetector.OpenDynamicThreshold, dynamicpathdetector.DefaultCollapseConfigs()), sbomSet) if err != nil { logger.L().Debug("ContainerProfileProcessor.deflateContainerProfileSpec - falling back to DeflateStringer for opens", loggerhelpers.Error(err)) opens = DeflateStringer(container.Opens) diff --git a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go index 721406ecf..278a158e5 100644 --- a/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go +++ b/pkg/registry/file/dynamicpathdetector/analyze_endpoints.go @@ -7,6 +7,8 @@ import ( "strings" mapset "github.com/deckarep/golang-set/v2" + "github.com/kubescape/go-logger" + loggerhelpers "github.com/kubescape/go-logger/helpers" types "github.com/kubescape/storage/pkg/apis/softwarecomposition" ) @@ -99,8 +101,26 @@ func AnalyzeURL(urlString string, analyzer *PathAnalyzer) (string, error) { return ":" + port + path, nil } +// splitEndpointPortAndPath splits the canonical `:` form +// produced by AnalyzeURL into its (port, path) parts. +// +// Defensive contract: AnalyzeURL guarantees a leading `:` and a port +// segment, but callers and tests sometimes pass bare paths (e.g. +// "/health") for ad-hoc lookups. To keep merge keys deterministic, +// this helper returns empty port + leading-slash-normalised path for +// any input that does not start with `:`. The empty string returns +// ("", "/") to match the original fall-through behavior. func splitEndpointPortAndPath(endpoint string) (string, string) { - s := strings.TrimPrefix(endpoint, ":") + if !strings.HasPrefix(endpoint, ":") { + if endpoint == "" { + return "", "/" + } + if !strings.HasPrefix(endpoint, "/") { + endpoint = "/" + endpoint + } + return "", endpoint + } + s := endpoint[1:] idx := strings.Index(s, "/") if idx == -1 { return s, "/" @@ -220,7 +240,12 @@ func mergeHeaders(existing, new *types.HTTPEndpoint) { rawJSON, err := json.Marshal(existingHeaders) if err != nil { - fmt.Println("Error marshaling JSON:", err) + // Don't pollute stdout from a library function. The caller has + // no signal-back path here (mergeHeaders is a void helper) so + // log at Debug and bail — leaving Headers untouched is the + // safer choice than corrupting them with a partial marshal. + logger.L().Debug("mergeHeaders: failed to marshal merged headers, leaving existing untouched", + loggerhelpers.Error(err)) return } diff --git a/pkg/registry/file/dynamicpathdetector/analyze_endpoints_internal_test.go b/pkg/registry/file/dynamicpathdetector/analyze_endpoints_internal_test.go new file mode 100644 index 000000000..b10fe21eb --- /dev/null +++ b/pkg/registry/file/dynamicpathdetector/analyze_endpoints_internal_test.go @@ -0,0 +1,55 @@ +package dynamicpathdetector + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// TestSplitEndpointPortAndPath_DefensiveContract pins the inputs that +// AnalyzeURL is supposed to produce (`:`) AND the defensive +// behavior for bare-path / empty / no-leading-slash inputs that may +// arrive via ad-hoc lookups or tests. Without the guard, "foo" was +// returned as ("foo", "/") — silently treating an opaque token as a +// port number. Flagged on upstream PR #316 by CodeRabbit (C7). +func TestSplitEndpointPortAndPath_DefensiveContract(t *testing.T) { + tests := []struct { + name string + input string + wantPort string + wantPath string + }{ + // Canonical AnalyzeURL output. + {"empty", "", "", "/"}, + {"port_only", ":80", "80", "/"}, + {"port_with_root_path", ":80/", "80", "/"}, + {"port_with_path", ":80/health", "80", "/health"}, + {"wildcard_port", ":0", "0", "/"}, + {"wildcard_port_with_path", ":0/api/users", "0", "/api/users"}, + {"port_with_deep_path", ":443/v1/items/42", "443", "/v1/items/42"}, + + // Defensive — bare paths arriving without the `:` prefix. + {"bare_path", "/health", "", "/health"}, + {"bare_root", "/", "", "/"}, + {"bare_deep_path", "/v1/items/42", "", "/v1/items/42"}, + + // Defensive — opaque token without a leading slash. Previous + // behavior silently returned ("foo", "/") which would be + // indistinguishable from port="foo". The guard normalises this + // to ("", "/foo"). + {"opaque_token", "foo", "", "/foo"}, + {"opaque_with_dot", "host.example.com", "", "/host.example.com"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotPort, gotPath := splitEndpointPortAndPath(tt.input) + assert.Equal(t, tt.wantPort, gotPort, + "splitEndpointPortAndPath(%q) port = %q, want %q", + tt.input, gotPort, tt.wantPort) + assert.Equal(t, tt.wantPath, gotPath, + "splitEndpointPortAndPath(%q) path = %q, want %q", + tt.input, gotPath, tt.wantPath) + }) + } +} diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index 9a0b517bc..74e31aca4 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -347,17 +347,51 @@ func shallowChildrenCopy(src, dst *SegmentNode) { } } -// CompareDynamic checks whether `regularPath` is matched by `dynamicPath`, -// where `dynamicPath` may contain DynamicIdentifier (⋯, single-segment -// wildcard) or WildcardIdentifier (*, zero-or-more-segment wildcard). -// The previous implementation only handled DynamicIdentifier, causing -// explicit `/etc/*` profile entries to never match at runtime — the -// node-agent R0002 rule (Files Access Anomalies) uses this to decide -// whether a file access is in-profile. +// CompareDynamic checks whether `regularPath` is matched by `dynamicPath`. +// The dynamic path may contain DynamicIdentifier (⋯, exactly-one-segment +// wildcard) or WildcardIdentifier (*, zero-or-more-segment mid-path / +// one-or-more-segment trailing wildcard). The node-agent R0002 rule +// (Files Access Anomalies) uses this at every file-open to decide whether +// the access is in-profile. +// +// Anchoring contract: +// - Anchored patterns (start with `/`): `/etc/*` matches files UNDER +// /etc but NOT the bare `/etc` directory itself, mirroring shell +// glob semantics. This avoids R0002 silently allowing access to a +// profiled directory's parent. +// - Unanchored `*` (no leading slash): explicit catch-all that also +// matches the root path `/`. The only way to whitelist `/` itself +// is an explicit unanchored `*`. +// +// Trailing-slash insensitivity: `/etc/` is treated as `/etc`, and +// `/etc/passwd/` as `/etc/passwd`. Trailing empty path components from +// `strings.Split` are trimmed so `len(regular) > 0` correctly reflects +// the presence of a real path tail when matching trailing `*`. +// +// The empty regular path (`""`) is treated as "no path" and matches +// nothing — distinct from the root path `/`, which matches unanchored +// `*` per the contract above. func CompareDynamic(dynamicPath, regularPath string) bool { - dynamicSegments := strings.Split(dynamicPath, "/") - regularSegments := strings.Split(regularPath, "/") - return compareSegments(dynamicSegments, regularSegments) + // Empty inputs match nothing. Note that splitPath("") and splitPath("/") + // both yield [""] after trim, so without this guard an empty profile + // entry would silently match the root path. + if dynamicPath == "" || regularPath == "" { + return false + } + return compareSegments(splitPath(dynamicPath), splitPath(regularPath)) +} + +// splitPath splits a path on `/` and trims trailing empty segments +// produced by trailing slashes (e.g. `/etc/` -> ["", "etc"] not +// ["", "etc", ""]). The leading empty segment from a leading slash is +// preserved as the anchor marker. Single-element results are not +// trimmed so the root path `/` retains its `[""]` shape. +func splitPath(p string) []string { + s := strings.Split(p, "/") + for len(s) > 1 && s[len(s)-1] == "" { + s = s[:len(s)-1] + } + return s } func compareSegments(dynamic, regular []string) bool { @@ -365,20 +399,23 @@ func compareSegments(dynamic, regular []string) bool { return len(regular) == 0 } if dynamic[0] == WildcardIdentifier { - // A trailing `*` matches any remaining path tail (including empty). + // Trailing `*` matches one OR MORE remaining segments — never + // zero. This is what makes `/etc/*` not match the bare `/etc` + // directory, while still matching `/etc/passwd` and any deeper + // path. The unanchored-`*` case (regular path is `/`, regular + // slice is [""]) returns true because len(regular) == 1. if len(dynamic) == 1 { - return true + return len(regular) > 0 } - // Try to match the rest of the dynamic pattern starting at every - // position in the regular tail — including i == 0 (the wildcard - // consumed zero segments) and every later offset (wildcard - // consumed i segments). No optimistic peek at dynamic[1]: that - // optimization used to require regular[i] to literally equal - // dynamic[1], which is wrong whenever dynamic[1] is itself - // another `*` (consecutive wildcards like `/*/*` would never - // recurse and thus never match — user-authored profiles can - // contain literal /*/* patterns even though analyzer-generated - // ones are squashed by collapseAdjacentDynamicIdentifiers). + // Mid-path `*`: zero-or-more semantics. Try every offset + // including i == 0 (wildcard consumed zero segments). No + // optimistic peek at dynamic[1]: that optimization used to + // require regular[i] to literally equal dynamic[1], which is + // wrong whenever dynamic[1] is itself another `*` (consecutive + // wildcards like `/*/*` would never recurse and thus never + // match — user-authored profiles can contain literal /*/* + // patterns even though analyzer-generated ones are squashed by + // collapseAdjacentDynamicIdentifiers). for i := 0; i <= len(regular); i++ { if compareSegments(dynamic[1:], regular[i:]) { return true diff --git a/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go b/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go index 09dbdf56a..d1a8f2a24 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/benchmark_test.go @@ -72,14 +72,44 @@ func BenchmarkAnalyzeOpensVsDeflateStringer(b *testing.B) { }) } -// func BenchmarkCompareDynamic(b *testing.B) { -// dynamicPath := "/api/\u22ef/\u22ef" -// regularPath := "/api/users/123" -// for i := 0; i < b.N; i++ { -// _ = dynamicpathdetector.CompareDynamic(dynamicPath, regularPath) -// } -// b.ReportAllocs() -// } +// BenchmarkCompareDynamic exercises the matcher under realistic shapes. +// Run with -benchmem; the goal is the zero-alloc target Matthias called +// out on upstream PR #316. Until the perf rewrite lands the numbers +// document the current cost so regressions show up. Inputs are split +// across cases so the benchmark covers: +// +// - DynamicIdentifier-only paths (auto-generated short patterns) +// - DynamicIdentifier-only paths (deeper, more typical R0002 traffic) +// - WildcardIdentifier with mid-path zero-or-more semantics +// - Trailing WildcardIdentifier (the regression-prone path) +// - Anchored / unanchored boundary cases (the new `*` vs `/*` contract) +func BenchmarkCompareDynamic(b *testing.B) { + cases := []struct { + name string + dynamic string + regular string + }{ + {"ellipsis_short", "/api/\u22ef/\u22ef", "/api/users/123"}, + {"ellipsis_deep", "/api/\u22ef/\u22ef/\u22ef/\u22ef", "/api/users/123/posts/42"}, + {"trailing_star", "/etc/*", "/etc/ssh/sshd_config"}, + {"trailing_star_no_match_on_parent", "/etc/*", "/etc"}, + {"mid_star_zero_consumed", "/a/*/b", "/a/b"}, + {"mid_star_many_consumed", "/a/*/b", "/a/x/y/z/b"}, + {"anchored_root_no_match", "/*", "/"}, + {"unanchored_star_root", "*", "/"}, + {"deep_literal_match", "/var/log/syslog", "/var/log/syslog"}, + {"deep_literal_mismatch", "/var/log/syslog", "/var/log/messages"}, + } + for _, c := range cases { + b.Run(c.name, func(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = dynamicpathdetector.CompareDynamic(c.dynamic, c.regular) + } + }) + } +} func generateMixedPaths(count int, fixedLength int) []string { paths := make([]string, count) diff --git a/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go b/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go index e6b6ab057..3b0956572 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/coverage_test.go @@ -6,6 +6,7 @@ import ( "github.com/kubescape/storage/pkg/registry/file/dynamicpathdetector" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) // configThreshold returns the collapse threshold for the given path prefix @@ -418,3 +419,229 @@ func TestCompareDynamic_WildcardRegressions(t *testing.T) { }) } } + +// TestCompareDynamic_AnchoringAndTrailing pins the trailing-`*` / +// anchoring contract reported on upstream PR #316. +// +// The first wildcard-aware implementation made a trailing `*` match +// zero-or-more remaining segments, so `/etc/*` silently matched the +// bare `/etc` directory — widening R0002's blind spot to cover the +// profiled directory's parent. Standard shell glob requires trailing +// `*` to consume one or more segments. Anchoring rules: +// +// - Anchored: leading `/` makes `/*` "any path strictly under /" +// and explicitly excludes the bare `/` directory. +// - Unanchored: a bare `*` (no leading slash) is the only way to +// allowlist the root path itself. +// +// Trailing slashes on the regular path are normalized away so +// `/etc/passwd/` is treated as `/etc/passwd`. +func TestCompareDynamic_AnchoringAndTrailing(t *testing.T) { + tests := []struct { + name string + dynamic string + regular string + want bool + }{ + // Root-only cases — anchored vs unanchored distinction. + {"anchored_star_does_not_match_root", "/*", "/", false}, + {"anchored_star_does_not_match_empty", "/*", "", false}, + {"anchored_star_matches_top_level_child", "/*", "/foo", true}, + {"anchored_star_matches_deeper_child", "/*", "/foo/bar", true}, + {"unanchored_star_matches_root", "*", "/", true}, + {"unanchored_star_matches_top_level_child", "*", "/foo", true}, + {"unanchored_star_does_not_match_empty", "*", "", false}, + + // Bare-parent boundary — the original /etc/* regression. + {"trailing_star_does_not_match_bare_parent", "/etc/*", "/etc", false}, + {"trailing_star_does_not_match_parent_with_slash", "/etc/*", "/etc/", false}, + {"trailing_star_matches_immediate_child", "/etc/*", "/etc/passwd", true}, + {"trailing_star_matches_deep_child", "/etc/*", "/etc/ssh/sshd_config", true}, + {"trailing_star_matches_child_with_trailing_slash", "/etc/*", "/etc/passwd/", true}, + {"deep_trailing_star_does_not_match_short_path", "/var/log/*", "/var/log", false}, + {"deep_trailing_star_does_not_match_grandparent", "/var/log/*", "/var", false}, + {"deep_trailing_star_matches_child", "/var/log/*", "/var/log/syslog", true}, + + // Multiple trailing wildcards — zero-or-more mid-* + one-or-more + // final-* together. The mid-* may consume zero, so /etc/*/* still + // matches /etc/ssh (one segment) by having the inner * consume 0 + // and the trailing * consume the segment. + {"double_trailing_does_not_match_parent", "/etc/*/*", "/etc", false}, + {"double_trailing_does_not_match_parent_slash", "/etc/*/*", "/etc/", false}, + {"double_trailing_matches_one_child", "/etc/*/*", "/etc/ssh", true}, + {"double_trailing_matches_two_children", "/etc/*/*", "/etc/ssh/sshd_config", true}, + {"double_trailing_matches_deep", "/etc/*/*", "/etc/ssh/dir/file", true}, + + // Empty / bare-pattern edges. + {"empty_dynamic_does_not_match_path", "", "/foo", false}, + {"empty_dynamic_does_not_match_root", "", "/", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := dynamicpathdetector.CompareDynamic(tt.dynamic, tt.regular) + assert.Equal(t, tt.want, got, + "CompareDynamic(%q, %q) = %v, want %v", tt.dynamic, tt.regular, got, tt.want) + }) + } +} + +// TestCompareDynamic_EllipsisAndStar pins the interaction between the +// two wildcard kinds: +// +// - DynamicIdentifier (⋯) consumes EXACTLY ONE segment. +// - WildcardIdentifier (*) consumes ZERO-OR-MORE segments mid-path +// and ONE-OR-MORE segments when trailing. +// +// Mixing them (e.g. `/⋯/*`) is the analyzer's normal output for a +// fully-collapsed grandchild branch: ⋯ pins the immediate child to +// "any single segment" and * accepts the deeper tail. +func TestCompareDynamic_EllipsisAndStar(t *testing.T) { + tests := []struct { + name string + dynamic string + regular string + want bool + }{ + // ⋯ alone: exactly one segment. + {"ellipsis_matches_exactly_one", "/⋯/foo", "/x/foo", true}, + {"ellipsis_does_not_consume_zero", "/⋯/foo", "/foo", false}, + {"ellipsis_does_not_consume_two", "/⋯/foo", "/x/y/foo", false}, + + // ⋯ then trailing *: ⋯ consumes 1, * needs ≥1 more. + {"ellipsis_then_trailing_star_two_segments", "/⋯/*", "/x/y", true}, + {"ellipsis_then_trailing_star_three_segments", "/⋯/*", "/x/y/z", true}, + {"ellipsis_then_trailing_star_one_segment_fails", "/⋯/*", "/x", false}, + {"ellipsis_then_trailing_star_root_fails", "/⋯/*", "/", false}, + + // Mid-* before ⋯: * may consume zero, ⋯ still needs exactly one. + {"star_then_ellipsis_two_segments", "/*/⋯", "/a/b", true}, + {"star_consumed_zero_then_ellipsis_matches_one", "/*/⋯", "/b", true}, + {"star_then_ellipsis_one_segment_fails_when_zero_consumed", "/*/⋯", "/", false}, + + // Nested ⋯. + {"nested_ellipsis_matches_two", "/⋯/⋯/foo", "/x/y/foo", true}, + {"nested_ellipsis_does_not_match_one", "/⋯/⋯/foo", "/x/foo", false}, + + // * literal * pattern around a static segment. + {"star_literal_star_matches", "/*/etc/*", "/foo/etc/passwd", true}, + {"star_literal_star_no_trailing_segment_fails", "/*/etc/*", "/foo/etc", false}, + {"star_literal_star_no_leading_consumed_zero_matches", "/*/etc/*", "/etc/passwd", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := dynamicpathdetector.CompareDynamic(tt.dynamic, tt.regular) + assert.Equal(t, tt.want, got, + "CompareDynamic(%q, %q) = %v, want %v", tt.dynamic, tt.regular, got, tt.want) + }) + } +} + +// TestCompareDynamic_MidPathStarZeroOrMore explicitly pins the +// zero-or-more semantics for `*` when it appears mid-path. This is +// distinct from trailing `*` (which is one-or-more) and is what allows +// auto-generated patterns like `/foo/*/bar` to also match `/foo/bar` +// (the wildcard consumed nothing, then the literal segment matched). +func TestCompareDynamic_MidPathStarZeroOrMore(t *testing.T) { + tests := []struct { + name string + dynamic string + regular string + want bool + }{ + {"mid_star_consumes_zero", "/a/*/b", "/a/b", true}, + {"mid_star_consumes_one", "/a/*/b", "/a/x/b", true}, + {"mid_star_consumes_many", "/a/*/b", "/a/x/y/b", true}, + {"mid_star_literal_after_mismatches", "/a/*/b", "/a/x/c", false}, + {"mid_star_literal_prefix_mismatch", "/a/*/b", "/z/x/b", false}, + + // Consecutive mid-stars — both can independently consume zero. + {"consecutive_mid_star_both_zero", "/a/*/*/b", "/a/b", true}, + {"consecutive_mid_star_one_zero_one_one", "/a/*/*/b", "/a/x/b", true}, + {"consecutive_mid_star_both_one", "/a/*/*/b", "/a/x/y/b", true}, + {"consecutive_mid_star_deeper", "/a/*/*/b", "/a/x/y/z/b", true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := dynamicpathdetector.CompareDynamic(tt.dynamic, tt.regular) + assert.Equal(t, tt.want, got, + "CompareDynamic(%q, %q) = %v, want %v", tt.dynamic, tt.regular, got, tt.want) + }) + } +} + +// TestDefaultCollapseConfigs_DefensiveCopy pins the contract that the +// public DefaultCollapseConfigs() accessor returns a fresh slice on +// every call, so callers cannot accidentally mutate the package-level +// state. Without this guard, a downstream consumer modifying the +// returned slice (sorting, appending, swapping prefixes) would silently +// affect every subsequent call, including AnalyzeOpens in the storage +// deflate path. The unexported var is the canonical source of truth. +func TestDefaultCollapseConfigs_DefensiveCopy(t *testing.T) { + first := dynamicpathdetector.DefaultCollapseConfigs() + require.NotEmpty(t, first, "default configs must not be empty") + + // Mutate the returned slice in two ways: change a Threshold and + // append a junk config. Neither should leak to the next call. + first[0].Threshold = 999_999 + first = append(first, dynamicpathdetector.CollapseConfig{ + Prefix: "/poisoned", Threshold: 1, + }) + + second := dynamicpathdetector.DefaultCollapseConfigs() + assert.NotEqual(t, 999_999, second[0].Threshold, + "mutating the first call's slice must not change the package state") + for _, cfg := range second { + assert.NotEqual(t, "/poisoned", cfg.Prefix, + "appending to the first call's slice must not leak into the second call") + } + + // Also assert the two slices are distinct backing arrays — without + // this, len(second) would happen to be safe but a future caller + // reading first[len-1] could observe the appended element. + if len(first) > 0 && len(second) > 0 { + // Address-of-element comparison is sufficient; if the underlying + // array is shared, &first[0] == &second[0]. + assert.NotSame(t, &first[0], &second[0], + "DefaultCollapseConfigs must return a fresh backing array") + } +} + +// TestCompareDynamic_PathSeparatorEdges documents how `/`-related +// edges are normalized: trailing slashes are insignificant, the +// regular path `""` is treated as no-path (matches nothing), and the +// internal split-and-trim normalization is exercised on both sides. +func TestCompareDynamic_PathSeparatorEdges(t *testing.T) { + tests := []struct { + name string + dynamic string + regular string + want bool + }{ + // Trailing slash on regular — should match same as without. + {"trailing_slash_on_regular_matches_literal", "/etc/passwd", "/etc/passwd/", true}, + {"trailing_slash_on_regular_for_directory_match", "/etc", "/etc/", true}, + + // Trailing slash on dynamic — should match same as without. + {"trailing_slash_on_dynamic_literal", "/etc/passwd/", "/etc/passwd", true}, + {"trailing_slash_on_dynamic_with_star", "/etc/*/", "/etc/passwd", true}, + + // Empty regular path — matches nothing, including the bare star. + {"empty_regular_does_not_match_anchored", "/foo", "", false}, + {"empty_regular_does_not_match_unanchored_literal", "foo", "", false}, + {"empty_regular_does_not_match_star", "*", "", false}, + + // Empty dynamic — matches nothing. + {"empty_dynamic_does_not_match_anything", "", "/foo", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := dynamicpathdetector.CompareDynamic(tt.dynamic, tt.regular) + assert.Equal(t, tt.want, got, + "CompareDynamic(%q, %q) = %v, want %v", tt.dynamic, tt.regular, got, tt.want) + }) + } +} diff --git a/pkg/registry/file/dynamicpathdetector/tests/profile_test.go b/pkg/registry/file/dynamicpathdetector/tests/profile_test.go index d9db951c8..b70adb5ee 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/profile_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/profile_test.go @@ -20,7 +20,6 @@ package dynamicpathdetectortests import ( "flag" - "fmt" "os" "path/filepath" "runtime" @@ -49,7 +48,10 @@ func TestProfileAnalyzePath(t *testing.T) { // Generate a representative mixed workload once, outside the measured // section, so its allocations don't show up in the profile. paths := generateMixedPaths(10000, 0) - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + dynamicpathdetector.DefaultCollapseConfigs(), + ) identifier := "profile" // Warm up the analyzer so the trie is populated. Steady-state calls @@ -80,7 +82,9 @@ func TestProfileAnalyzePath(t *testing.T) { for i := 0; i < *profileIters; i++ { if _, err := analyzer.AnalyzePath(paths[i%len(paths)], identifier); err != nil { pprof.StopCPUProfile() - cpuFile.Close() + if cerr := cpuFile.Close(); cerr != nil { + t.Logf("close cpu profile after error: %v", cerr) + } t.Fatalf("AnalyzePath iter %d: %v", i, err) } } @@ -94,7 +98,9 @@ func TestProfileAnalyzePath(t *testing.T) { runtime.ReadMemStats(&after) pprof.StopCPUProfile() - cpuFile.Close() + if err := cpuFile.Close(); err != nil { + t.Fatalf("close cpu profile: %v", err) + } // Heap profile (alloc_space). memPath := filepath.Join(*profileOutDir, "mem.out") @@ -105,7 +111,9 @@ func TestProfileAnalyzePath(t *testing.T) { if err := pprof.Lookup("allocs").WriteTo(memFile, 0); err != nil { t.Fatalf("write mem profile: %v", err) } - memFile.Close() + if err := memFile.Close(); err != nil { + t.Fatalf("close mem profile: %v", err) + } // Goroutine snapshot (useful when debugging leaks). goPath := filepath.Join(*profileOutDir, "goroutine.out") @@ -116,7 +124,9 @@ func TestProfileAnalyzePath(t *testing.T) { if err := pprof.Lookup("goroutine").WriteTo(goFile, 0); err != nil { t.Fatalf("write goroutine profile: %v", err) } - goFile.Close() + if err := goFile.Close(); err != nil { + t.Fatalf("close goroutine profile: %v", err) + } totalBytes := after.TotalAlloc - before.TotalAlloc totalMallocs := after.Mallocs - before.Mallocs @@ -135,7 +145,10 @@ func TestProfileAnalyzePath(t *testing.T) { // new SegmentNode. func BenchmarkAnalyzePathWarm(b *testing.B) { paths := generateMixedPaths(10000, 0) - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + dynamicpathdetector.DefaultCollapseConfigs(), + ) identifier := "warm" for _, p := range paths { if _, err := analyzer.AnalyzePath(p, identifier); err != nil { @@ -162,12 +175,12 @@ func BenchmarkAnalyzePathCold(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - analyzer := dynamicpathdetector.NewPathAnalyzer(100) + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + dynamicpathdetector.DefaultCollapseConfigs(), + ) if _, err := analyzer.AnalyzePath(paths[i%len(paths)], identifier); err != nil { b.Fatalf("AnalyzePath: %v", err) } } } - -// Ensure fmt is kept imported when future debugging prints land here. -var _ = fmt.Sprint diff --git a/pkg/registry/file/dynamicpathdetector/types.go b/pkg/registry/file/dynamicpathdetector/types.go index b581dee90..251c77f29 100644 --- a/pkg/registry/file/dynamicpathdetector/types.go +++ b/pkg/registry/file/dynamicpathdetector/types.go @@ -26,16 +26,24 @@ type CollapseConfig struct { Threshold int } -// DefaultCollapseConfigs carries the per-prefix thresholds we've found +// defaultCollapseConfigs carries the per-prefix thresholds we've found // useful in practice. These are the defaults wired into AnalyzeOpens; a // caller can pass a different slice via NewPathAnalyzerWithConfigs if -// they want to tune for their workload. -var DefaultCollapseConfigs = []CollapseConfig{ +// they want to tune for their workload. Unexported so callers cannot +// mutate the package-level slice — access via DefaultCollapseConfigs(). +var defaultCollapseConfigs = []CollapseConfig{ {Prefix: "/etc", Threshold: 100}, {Prefix: "/etc/apache2", Threshold: 50}, // tuned for the webapp standard test {Prefix: "/opt", Threshold: 50}, {Prefix: "/var/run", Threshold: 50}, - {Prefix: "/app", Threshold: 50}, // any variation under /app collapses immediately + {Prefix: "/app", Threshold: 50}, // any variation under /app collapses at 50 unique children +} + +// DefaultCollapseConfigs returns a defensive copy of the package-level +// default per-prefix collapse thresholds. Callers that mutate the result +// will not affect the package state or other callers. +func DefaultCollapseConfigs() []CollapseConfig { + return append([]CollapseConfig(nil), defaultCollapseConfigs...) } // DefaultCollapseConfig is the global fallback used when no CollapseConfig From acdc3e5aa3c65a19982c77754ed2eb2b7e2e317b Mon Sep 17 00:00:00 2001 From: entlein Date: Fri, 1 May 2026 19:19:07 +0200 Subject: [PATCH 16/16] addressing the defensive implementation for FindConfigForPath as raised by the Rabbit Signed-off-by: entlein --- .../file/dynamicpathdetector/analyzer.go | 30 +++++++------- .../tests/analyze_opens_test.go | 39 ++++++++++++++++++- 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/pkg/registry/file/dynamicpathdetector/analyzer.go b/pkg/registry/file/dynamicpathdetector/analyzer.go index 74e31aca4..b4562ef76 100644 --- a/pkg/registry/file/dynamicpathdetector/analyzer.go +++ b/pkg/registry/file/dynamicpathdetector/analyzer.go @@ -432,27 +432,31 @@ func compareSegments(dynamic, regular []string) bool { return false } -// FindConfigForPath returns the CollapseConfig whose Prefix matches -// `path` with the longest match, or nil if none match. Exposed so -// callers and tests can introspect which threshold will apply to a -// given path without walking the trie. -func (ua *PathAnalyzer) FindConfigForPath(path string) *CollapseConfig { - var best *CollapseConfig +// FindConfigForPath returns a value copy of the CollapseConfig whose +// Prefix matches `path` with the longest match. Falls back to the +// analyzer's default config (Prefix:"/") when no per-prefix override +// applies, so the result is always meaningful — there is no "no match" +// signal. +// +// Returning by value keeps the analyzer's internal state immutable +// from callers. NewPathAnalyzerWithConfigs already makes a defensive +// inbound copy of `configs`; this is its outbound twin. Without it, +// `cfg := analyzer.FindConfigForPath(p); cfg.Threshold = 1` would +// silently mutate the analyzer's threshold map for every future call. +func (ua *PathAnalyzer) FindConfigForPath(path string) CollapseConfig { + bestIdx := -1 bestLen := -1 for i := range ua.configs { cfg := &ua.configs[i] if hasPrefixAtBoundary(path, cfg.Prefix) && len(cfg.Prefix) > bestLen { - best = cfg + bestIdx = i bestLen = len(cfg.Prefix) } } - // Fall back to the `/` default config if no per-prefix override - // matched — callers expect a non-nil result when *any* threshold - // applies, and the default always applies. - if best == nil { - return &ua.defaultCfg + if bestIdx == -1 { + return ua.defaultCfg } - return best + return ua.configs[bestIdx] } // CollapseAdjacentDynamicIdentifiers replaces runs of adjacent diff --git a/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go b/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go index d2a83b11f..2c55e7d4d 100644 --- a/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go +++ b/pkg/registry/file/dynamicpathdetector/tests/analyze_opens_test.go @@ -924,7 +924,6 @@ func TestFindConfigForPath(t *testing.T) { for _, tt := range tests { t.Run(tt.path, func(t *testing.T) { config := analyzer.FindConfigForPath(tt.path) - assert.NotNil(t, config, "config should not be nil for path %q", tt.path) assert.Equal(t, tt.expectedPrefix, config.Prefix, "path %q should match prefix %q", tt.path, tt.expectedPrefix) assert.Equal(t, tt.expectedThreshold, config.Threshold, @@ -932,3 +931,41 @@ func TestFindConfigForPath(t *testing.T) { }) } } + +// TestFindConfigForPath_DefensiveCopy pins the contract that the +// returned CollapseConfig is a value copy, so mutating it does NOT +// alter the analyzer's internal state. Same defensive philosophy as +// DefaultCollapseConfigs(): the caller cannot accidentally turn an +// introspection helper into a hidden mutator. +func TestFindConfigForPath_DefensiveCopy(t *testing.T) { + analyzer := dynamicpathdetector.NewPathAnalyzerWithConfigs( + dynamicpathdetector.OpenDynamicThreshold, + dynamicpathdetector.DefaultCollapseConfigs(), + ) + + // Per-prefix override path: snapshot, mutate the copy, look up + // again, expect the analyzer to be unaffected. + first := analyzer.FindConfigForPath("/etc/file") + originalThreshold := first.Threshold + first.Threshold = 999_999 + first.Prefix = "/poisoned" + + second := analyzer.FindConfigForPath("/etc/file") + assert.Equal(t, originalThreshold, second.Threshold, + "mutating the first call's CollapseConfig must not change the analyzer state") + assert.NotEqual(t, "/poisoned", second.Prefix, + "mutating the first call's Prefix must not leak into the second call") + + // Default-fallback path: same contract for the path that doesn't + // match any per-prefix override. + firstDefault := analyzer.FindConfigForPath("/no/match/anywhere") + originalDefaultThreshold := firstDefault.Threshold + firstDefault.Threshold = 12345 + firstDefault.Prefix = "/poisoned-default" + + secondDefault := analyzer.FindConfigForPath("/no/match/anywhere") + assert.Equal(t, originalDefaultThreshold, secondDefault.Threshold, + "mutating the default config copy must not change the analyzer state") + assert.NotEqual(t, "/poisoned-default", secondDefault.Prefix, + "mutating the default config Prefix must not leak into a future call") +}