diff --git a/tests/tools/test_approval.py b/tests/tools/test_approval.py index 2d7bfe6b0a8..ee3a1370a9f 100644 --- a/tests/tools/test_approval.py +++ b/tests/tools/test_approval.py @@ -837,3 +837,126 @@ def test_safe_chmod_without_execute_not_flagged(self): dangerous, _, _ = detect_dangerous_command(cmd) assert dangerous is False + +class TestShellTokenizationBypass: + """Advisories GHSA-xvmp-c27r-qw3c and GHSA-pwx6-3q74-cxwr reported four + shell-tokenization tricks that bypass the literal-string regex layer. + ``_expand_shell_obfuscations`` reverses each of them so the underlying + command name becomes visible to ``DANGEROUS_PATTERNS``. + """ + + # --- empty-quote pair bypass ----------------------------------------- + + def test_empty_double_quote_pair_bypass_detected(self): + cmd = 'r""m -rf /tmp/foo' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_empty_single_quote_pair_bypass_detected(self): + cmd = "r''m -rf /tmp/foo" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + # --- ${IFS} / $IFS expansion bypass ----------------------------------- + + def test_ifs_brace_expansion_bypass_detected(self): + cmd = 'rm${IFS}-rf${IFS}/tmp/foo' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_ifs_bare_expansion_bypass_detected(self): + cmd = 'rm$IFS-rf$IFS/tmp/foo' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_ifs_chmod_bypass_detected(self): + cmd = 'chmod${IFS}777${IFS}/etc/shadow' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + # --- $(...) / backtick command substitution bypass ------------------- + + def test_cmd_subst_script_exec_bypass_detected(self): + cmd = "$(echo python3) -c 'print(1)'" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_cmd_subst_quoted_identifier_bypass_detected(self): + cmd = "$(echo 'rm') -rf /tmp/foo" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_cmd_subst_double_quoted_identifier_bypass_detected(self): + cmd = '$(echo "rm") -rf /tmp/foo' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_backtick_substitution_bypass_detected(self): + cmd = '`echo rm` -rf /tmp/foo' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_nested_cmd_substitution_bypass_detected(self): + cmd = '$(echo $(echo rm)) -rf /tmp/foo' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_cmd_subst_git_destructive_bypass_detected(self): + cmd = "$(echo 'git') reset --hard HEAD" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + # --- ANSI-C quoting bypass -------------------------------------------- + + def test_ansi_c_quote_hex_bypass_detected(self): + # $'\x72\x6d' = 'rm' + cmd = "$'\\x72\\x6d' -rf /tmp/foo" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + def test_ansi_c_quote_plus_ifs_combined_bypass_detected(self): + cmd = "$'\\x72\\x6d'${IFS}-rf${IFS}/tmp" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is True + + # --- legitimate commands MUST NOT false-positive -------------------- + + def test_quoted_log_message_with_rm_not_flagged(self): + """A quoted string containing 'rm' as a word should not trigger.""" + cmd = 'echo "this is a message mentioning rm here"' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is False + + def test_grep_with_quoted_pattern_not_flagged(self): + cmd = "grep 'pattern' file.txt" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is False + + def test_git_commit_with_quoted_message_not_flagged(self): + cmd = "git commit -m 'fix: update'" + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is False + + def test_curl_with_quoted_header_not_flagged(self): + cmd = 'curl -H "Content-Type: application/json" https://api.example.com' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is False + + def test_echo_with_variable_expansion_not_flagged(self): + """Variable expansion that isn't ${IFS} must not be substituted.""" + cmd = 'echo ${PATH}' + dangerous, _, _ = detect_dangerous_command(cmd) + assert dangerous is False + + # --- regression: structural patterns still fire on literal form ------ + + def test_structural_pgrep_still_detected(self): + """kill $(pgrep ...) must still match the literal structural pattern — + this relies on the base-normalized form still containing $(, not just + the expanded form. + """ + cmd = 'kill -9 $(pgrep -f "hermes.*gateway")' + dangerous, _, desc = detect_dangerous_command(cmd) + assert dangerous is True + assert "pgrep" in desc.lower() + diff --git a/tools/approval.py b/tools/approval.py index fc344bd77b7..f5dd191f2b9 100644 --- a/tools/approval.py +++ b/tools/approval.py @@ -166,12 +166,56 @@ def _approval_key_aliases(pattern_key: str) -> set[str]: # Detection # ========================================================================= +# Shell tokens that the shell removes or substitutes during tokenization but +# that the literal-string regex layer still sees as opaque characters. These +# are deterministic bypasses of the pattern-based detection (the shell will +# always strip/substitute them), so we apply the same transform here before +# matching. +_EMPTY_QUOTE_RE = re.compile(r"''|\"\"") +_IFS_EXPANSION_RE = re.compile(r'\$\{IFS\}|\$IFS\b', re.IGNORECASE) +# $'...' is bash ANSI-C quoting — \xNN, \NNN (octal), \cX, \uNNNN sequences +# expand to literal bytes at exec time. We decode only the \xNN form here +# because that's the documented bypass; other escapes (\n, \t) don't help an +# attacker hide a pattern match. +_ANSI_C_QUOTE_RE = re.compile(r"\$'((?:[^'\\]|\\.)*)'") +# Command substitution $(cmd args) — the outer command's name can be hidden +# behind an expansion. We rescan the inner command so that +# `$(echo rm) -rf /tmp` is recognised via the inner `rm`. Nested substitutions +# are handled by iterating until stable. +_CMD_SUBST_RE = re.compile(r'\$\(([^()]*)\)') +_BACKTICK_SUBST_RE = re.compile(r'`([^`]*)`') +# Quoted identifier tokens — 'rm', "rm", etc. The shell strips the quotes at +# tokenize time, leaving a bare identifier. Only strip when the body is a +# single shell-safe identifier so we don't mangle legit quoted strings that +# contain spaces or shell metacharacters. +_QUOTED_IDENT_RE = re.compile(r"""(?x) + (?:'([A-Za-z0-9_\-./+]+)'|"([A-Za-z0-9_\-./+]+)") +""") + + +def _decode_ansi_c_body(body: str) -> str: + """Expand \\xNN escapes inside a $'...' ANSI-C-quoted body.""" + def _sub(m): + try: + return chr(int(m.group(1), 16)) + except ValueError: + return m.group(0) + return re.sub(r'\\x([0-9a-fA-F]{2})', _sub, body) + + def _normalize_command_for_detection(command: str) -> str: """Normalize a command string before dangerous-pattern matching. Strips ANSI escape sequences (full ECMA-48 via tools.ansi_strip), - null bytes, and normalizes Unicode fullwidth characters so that - obfuscation techniques cannot bypass the pattern-based detection. + null bytes, and normalizes Unicode fullwidth characters. + + Note: This function does NOT expand shell-tokenization obfuscations + (``${IFS}``, ``$'\\xNN'``, ``$(cmd)``). Those are handled by the + complementary ``_expand_shell_obfuscations`` pass, which is run + alongside this one — some DANGEROUS_PATTERNS entries target the + literal substituted form (e.g. ``kill $(pgrep ...)`` as a + self-termination structural signal) while others need the expanded + form. ``detect_dangerous_command`` matches against both. """ from tools.ansi_strip import strip_ansi @@ -184,17 +228,64 @@ def _normalize_command_for_detection(command: str) -> str: return command +def _expand_shell_obfuscations(command: str) -> str: + """Reverse shell-tokenization obfuscations that would defeat literal-string + regex matching: empty quote pairs, quoted identifier tokens, ``${IFS}`` + expansion, ``$'\\xNN'`` ANSI-C quoting, and ``$(cmd)`` / backtick + command substitution. + + The shell strips/substitutes these at tokenize time, so a regex that + matches ``rm`` against the literal string ``r""m`` or ``rm${IFS}-rf`` + will miss them — the shell nonetheless executes ``rm``. We apply the + same transform here so detection sees what the shell will see. + + This pass is complementary to ``_normalize_command_for_detection`` — + ``detect_dangerous_command`` matches against both. + """ + # Empty quote pairs (r""m, f''oo) collapse to nothing at tokenize time. + command = _EMPTY_QUOTE_RE.sub('', command) + # ${IFS} / $IFS expands to whitespace. Use a single space — sufficient + # for \b word-boundary matches in DANGEROUS_PATTERNS. + command = _IFS_EXPANSION_RE.sub(' ', command) + # $'...' ANSI-C quoting: decode \xNN so `$'\x72\x6d' -rf` resolves to `rm -rf` + command = _ANSI_C_QUOTE_RE.sub(lambda m: _decode_ansi_c_body(m.group(1)), command) + # $(cmd args) / `cmd args` command substitution: inline the inner body so + # the outer command's name becomes visible to the regex. Loop to cover + # nesting, with a small bound to prevent pathological input from running + # unboundedly. + for _ in range(4): + new = _CMD_SUBST_RE.sub(lambda m: ' ' + m.group(1) + ' ', command) + new = _BACKTICK_SUBST_RE.sub(lambda m: ' ' + m.group(1) + ' ', new) + if new == command: + break + command = new + # Quoted identifier tokens (shell strips the quotes at tokenize time). + # Run AFTER command substitution so the inlined bodies also get unquoted. + command = _QUOTED_IDENT_RE.sub(lambda m: m.group(1) or m.group(2), command) + return command + + def detect_dangerous_command(command: str) -> tuple: """Check if a command matches any dangerous patterns. + Matches against BOTH the base-normalized form (preserves literal + ``$(...)`` / backtick / ``${IFS}`` so structural patterns like + ``kill $(pgrep ...)`` still fire) AND the expanded form (so that + obfuscations like ``r""m -rf`` or ``$(echo rm) -rf`` also fire via + the inner command name). + Returns: (is_dangerous, pattern_key, description) or (False, None, None) """ - command_lower = _normalize_command_for_detection(command).lower() + base = _normalize_command_for_detection(command).lower() + expanded = _expand_shell_obfuscations(base) for pattern, description in DANGEROUS_PATTERNS: - if re.search(pattern, command_lower, re.IGNORECASE | re.DOTALL): - pattern_key = description - return (True, pattern_key, description) + if re.search(pattern, base, re.IGNORECASE | re.DOTALL): + return (True, description, description) + if expanded != base and re.search( + pattern, expanded, re.IGNORECASE | re.DOTALL + ): + return (True, description, description) return (False, None, None)