Skip to content
Closed

. #13476

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions tests/tools/test_approval.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,3 +837,126 @@ def test_safe_chmod_without_execute_not_flagged(self):
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is False


class TestShellTokenizationBypass:
"""Advisories GHSA-xvmp-c27r-qw3c and GHSA-pwx6-3q74-cxwr reported four
shell-tokenization tricks that bypass the literal-string regex layer.
``_expand_shell_obfuscations`` reverses each of them so the underlying
command name becomes visible to ``DANGEROUS_PATTERNS``.
"""

# --- empty-quote pair bypass -----------------------------------------

def test_empty_double_quote_pair_bypass_detected(self):
cmd = 'r""m -rf /tmp/foo'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True
Comment on lines +850 to +853
Copy link
Copy Markdown

@latekvo latekvo Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it better to deny access based on something like ptrace? Then you don't have to worry about complex regex or edge cases like this one.
It would be more pedantic than current approaches, but i'm not sure if that's a bad thing.
The user either allows for access to files outside the project or not, if it's the latter then i don't think every case can be handled perfectly, it could be better to default to rejecting such removals/edits.


def test_empty_single_quote_pair_bypass_detected(self):
cmd = "r''m -rf /tmp/foo"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

# --- ${IFS} / $IFS expansion bypass -----------------------------------

def test_ifs_brace_expansion_bypass_detected(self):
cmd = 'rm${IFS}-rf${IFS}/tmp/foo'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_ifs_bare_expansion_bypass_detected(self):
cmd = 'rm$IFS-rf$IFS/tmp/foo'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_ifs_chmod_bypass_detected(self):
cmd = 'chmod${IFS}777${IFS}/etc/shadow'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

# --- $(...) / backtick command substitution bypass -------------------

def test_cmd_subst_script_exec_bypass_detected(self):
cmd = "$(echo python3) -c 'print(1)'"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_cmd_subst_quoted_identifier_bypass_detected(self):
cmd = "$(echo 'rm') -rf /tmp/foo"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_cmd_subst_double_quoted_identifier_bypass_detected(self):
cmd = '$(echo "rm") -rf /tmp/foo'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_backtick_substitution_bypass_detected(self):
cmd = '`echo rm` -rf /tmp/foo'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_nested_cmd_substitution_bypass_detected(self):
cmd = '$(echo $(echo rm)) -rf /tmp/foo'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_cmd_subst_git_destructive_bypass_detected(self):
cmd = "$(echo 'git') reset --hard HEAD"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

# --- ANSI-C quoting bypass --------------------------------------------

def test_ansi_c_quote_hex_bypass_detected(self):
# $'\x72\x6d' = 'rm'
cmd = "$'\\x72\\x6d' -rf /tmp/foo"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

def test_ansi_c_quote_plus_ifs_combined_bypass_detected(self):
cmd = "$'\\x72\\x6d'${IFS}-rf${IFS}/tmp"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is True

# --- legitimate commands MUST NOT false-positive --------------------

def test_quoted_log_message_with_rm_not_flagged(self):
"""A quoted string containing 'rm' as a word should not trigger."""
cmd = 'echo "this is a message mentioning rm here"'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is False

def test_grep_with_quoted_pattern_not_flagged(self):
cmd = "grep 'pattern' file.txt"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is False

def test_git_commit_with_quoted_message_not_flagged(self):
cmd = "git commit -m 'fix: update'"
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is False

def test_curl_with_quoted_header_not_flagged(self):
cmd = 'curl -H "Content-Type: application/json" https://api.example.com'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is False

def test_echo_with_variable_expansion_not_flagged(self):
"""Variable expansion that isn't ${IFS} must not be substituted."""
cmd = 'echo ${PATH}'
dangerous, _, _ = detect_dangerous_command(cmd)
assert dangerous is False

# --- regression: structural patterns still fire on literal form ------

def test_structural_pgrep_still_detected(self):
"""kill $(pgrep ...) must still match the literal structural pattern —
this relies on the base-normalized form still containing $(, not just
the expanded form.
"""
cmd = 'kill -9 $(pgrep -f "hermes.*gateway")'
dangerous, _, desc = detect_dangerous_command(cmd)
assert dangerous is True
assert "pgrep" in desc.lower()

103 changes: 97 additions & 6 deletions tools/approval.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,56 @@ def _approval_key_aliases(pattern_key: str) -> set[str]:
# Detection
# =========================================================================

# Shell tokens that the shell removes or substitutes during tokenization but
# that the literal-string regex layer still sees as opaque characters. These
# are deterministic bypasses of the pattern-based detection (the shell will
# always strip/substitute them), so we apply the same transform here before
# matching.
_EMPTY_QUOTE_RE = re.compile(r"''|\"\"")
_IFS_EXPANSION_RE = re.compile(r'\$\{IFS\}|\$IFS\b', re.IGNORECASE)
# $'...' is bash ANSI-C quoting — \xNN, \NNN (octal), \cX, \uNNNN sequences
# expand to literal bytes at exec time. We decode only the \xNN form here
# because that's the documented bypass; other escapes (\n, \t) don't help an
# attacker hide a pattern match.
_ANSI_C_QUOTE_RE = re.compile(r"\$'((?:[^'\\]|\\.)*)'")
# Command substitution $(cmd args) — the outer command's name can be hidden
# behind an expansion. We rescan the inner command so that
# `$(echo rm) -rf /tmp` is recognised via the inner `rm`. Nested substitutions
# are handled by iterating until stable.
_CMD_SUBST_RE = re.compile(r'\$\(([^()]*)\)')
_BACKTICK_SUBST_RE = re.compile(r'`([^`]*)`')
# Quoted identifier tokens — 'rm', "rm", etc. The shell strips the quotes at
# tokenize time, leaving a bare identifier. Only strip when the body is a
# single shell-safe identifier so we don't mangle legit quoted strings that
# contain spaces or shell metacharacters.
_QUOTED_IDENT_RE = re.compile(r"""(?x)
(?:'([A-Za-z0-9_\-./+]+)'|"([A-Za-z0-9_\-./+]+)")
""")


def _decode_ansi_c_body(body: str) -> str:
"""Expand \\xNN escapes inside a $'...' ANSI-C-quoted body."""
def _sub(m):
try:
return chr(int(m.group(1), 16))
except ValueError:
return m.group(0)
return re.sub(r'\\x([0-9a-fA-F]{2})', _sub, body)


def _normalize_command_for_detection(command: str) -> str:
"""Normalize a command string before dangerous-pattern matching.

Strips ANSI escape sequences (full ECMA-48 via tools.ansi_strip),
null bytes, and normalizes Unicode fullwidth characters so that
obfuscation techniques cannot bypass the pattern-based detection.
null bytes, and normalizes Unicode fullwidth characters.

Note: This function does NOT expand shell-tokenization obfuscations
(``${IFS}``, ``$'\\xNN'``, ``$(cmd)``). Those are handled by the
complementary ``_expand_shell_obfuscations`` pass, which is run
alongside this one — some DANGEROUS_PATTERNS entries target the
literal substituted form (e.g. ``kill $(pgrep ...)`` as a
self-termination structural signal) while others need the expanded
form. ``detect_dangerous_command`` matches against both.
"""
from tools.ansi_strip import strip_ansi

Expand All @@ -184,17 +228,64 @@ def _normalize_command_for_detection(command: str) -> str:
return command


def _expand_shell_obfuscations(command: str) -> str:
"""Reverse shell-tokenization obfuscations that would defeat literal-string
regex matching: empty quote pairs, quoted identifier tokens, ``${IFS}``
expansion, ``$'\\xNN'`` ANSI-C quoting, and ``$(cmd)`` / backtick
command substitution.

The shell strips/substitutes these at tokenize time, so a regex that
matches ``rm`` against the literal string ``r""m`` or ``rm${IFS}-rf``
will miss them — the shell nonetheless executes ``rm``. We apply the
same transform here so detection sees what the shell will see.

This pass is complementary to ``_normalize_command_for_detection`` —
``detect_dangerous_command`` matches against both.
"""
# Empty quote pairs (r""m, f''oo) collapse to nothing at tokenize time.
command = _EMPTY_QUOTE_RE.sub('', command)
# ${IFS} / $IFS expands to whitespace. Use a single space — sufficient
# for \b word-boundary matches in DANGEROUS_PATTERNS.
command = _IFS_EXPANSION_RE.sub(' ', command)
# $'...' ANSI-C quoting: decode \xNN so `$'\x72\x6d' -rf` resolves to `rm -rf`
command = _ANSI_C_QUOTE_RE.sub(lambda m: _decode_ansi_c_body(m.group(1)), command)
# $(cmd args) / `cmd args` command substitution: inline the inner body so
# the outer command's name becomes visible to the regex. Loop to cover
# nesting, with a small bound to prevent pathological input from running
# unboundedly.
for _ in range(4):
new = _CMD_SUBST_RE.sub(lambda m: ' ' + m.group(1) + ' ', command)
new = _BACKTICK_SUBST_RE.sub(lambda m: ' ' + m.group(1) + ' ', new)
if new == command:
break
command = new
# Quoted identifier tokens (shell strips the quotes at tokenize time).
# Run AFTER command substitution so the inlined bodies also get unquoted.
command = _QUOTED_IDENT_RE.sub(lambda m: m.group(1) or m.group(2), command)
return command


def detect_dangerous_command(command: str) -> tuple:
"""Check if a command matches any dangerous patterns.

Matches against BOTH the base-normalized form (preserves literal
``$(...)`` / backtick / ``${IFS}`` so structural patterns like
``kill $(pgrep ...)`` still fire) AND the expanded form (so that
obfuscations like ``r""m -rf`` or ``$(echo rm) -rf`` also fire via
the inner command name).

Returns:
(is_dangerous, pattern_key, description) or (False, None, None)
"""
command_lower = _normalize_command_for_detection(command).lower()
base = _normalize_command_for_detection(command).lower()
expanded = _expand_shell_obfuscations(base)
for pattern, description in DANGEROUS_PATTERNS:
if re.search(pattern, command_lower, re.IGNORECASE | re.DOTALL):
pattern_key = description
return (True, pattern_key, description)
if re.search(pattern, base, re.IGNORECASE | re.DOTALL):
return (True, description, description)
if expanded != base and re.search(
pattern, expanded, re.IGNORECASE | re.DOTALL
):
return (True, description, description)
return (False, None, None)


Expand Down
Loading