From 8021feed47627b9079b5857dd041c91f34efd266 Mon Sep 17 00:00:00 2001 From: lczyk Date: Mon, 4 May 2026 16:10:15 +0100 Subject: [PATCH 1/5] test!: regression coverage for `(?i:UPPER)` literal mismatch `inlined_memicmp` requires the `Condition::PrefixInsensitive` needle to be ascii-lowercase: it lowercases the haystack byte and compares it raw against the needle. `emit_literal` currently interns the literal verbatim, so any uppercase byte makes the prefix check mismatch unconditionally and the `if` branch silently never fires. three failing tests cover the cases: - single-literal `(?i:FOO)` against uppercase input - single-literal `(?i:FOO)` against lowercase input (sanity check that case-insensitivity itself works) - alternation `(?i:FROM|RUN|CMD)` against mixed-case inputs these will pass once `emit_literal` lowercases the needle when `case_insensitive` is set. failing on purpose -- hence `test!:`. --- crates/lsh/Cargo.toml | 3 + crates/lsh/tests/regex_case_insensitive.rs | 100 +++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 crates/lsh/tests/regex_case_insensitive.rs diff --git a/crates/lsh/Cargo.toml b/crates/lsh/Cargo.toml index c079ef797f4..525efda5d71 100644 --- a/crates/lsh/Cargo.toml +++ b/crates/lsh/Cargo.toml @@ -9,3 +9,6 @@ rust-version.workspace = true [dependencies] stdext.workspace = true + +[dev-dependencies] +stdext.workspace = true diff --git a/crates/lsh/tests/regex_case_insensitive.rs b/crates/lsh/tests/regex_case_insensitive.rs new file mode 100644 index 00000000000..b0025633165 --- /dev/null +++ b/crates/lsh/tests/regex_case_insensitive.rs @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Regression test for the `(?i:...)` literal bug. +//! +//! `inlined_memicmp` (in `runtime.rs`) requires the needle for +//! `Condition::PrefixInsensitive` to be ASCII-lowercase: it lowercases each +//! haystack byte and compares it raw against the needle. Before the fix in +//! `emit_literal`, the literal was interned verbatim, so any uppercase byte +//! made the prefix check mismatch unconditionally and the `if` branch +//! silently never fired. + +use lsh::compiler::Compiler; +use lsh::runtime::Runtime; +use stdext::arena::{self, Arena, scratch_arena}; + +fn ensure_scratch() { + use std::sync::Once; + static ONCE: Once = Once::new(); + ONCE.call_once(|| { + arena::init(1 << 20).unwrap(); + }); +} + +fn compile_and_run(src: &str, line: &[u8]) -> (Vec<(usize, u32)>, Vec) { + ensure_scratch(); + let arena = Arena::new(1 << 20).unwrap(); + let mut compiler = Compiler::new(&arena); + compiler.parse("test.lsh", src).unwrap(); + let assembly = compiler.assemble().unwrap(); + + let charsets: Vec<[u16; 16]> = assembly.charsets.iter().map(|c| c.serialize()).collect(); + let kind_names: Vec = { + let max = assembly.highlight_kinds.iter().map(|hk| hk.value).max().unwrap_or(0); + let mut names = vec![String::new(); max as usize + 1]; + for hk in &assembly.highlight_kinds { + names[hk.value as usize] = hk.identifier.to_string(); + } + names + }; + let entry = assembly.entrypoints[0].address as u32; + let mut runtime = + Runtime::new(&assembly.instructions, &assembly.strings, &charsets, entry); + + let scratch = scratch_arena(None); + let highlights = runtime.parse_next_line::(&scratch, line); + let pairs = highlights.iter().map(|h| (h.start, h.kind)).collect(); + (pairs, kind_names) +} + +fn assert_kind_present(src: &str, line: &[u8], expected_kind: &str) { + let (highlights, kind_names) = compile_and_run(src, line); + let present: Vec<&str> = highlights + .iter() + .filter_map(|(_, k)| kind_names.get(*k as usize).map(String::as_str)) + .collect(); + assert!( + present.contains(&expected_kind), + "expected `{expected_kind}` span on input {:?}, got kinds: {:?} (raw: {:?})", + std::str::from_utf8(line).unwrap_or(""), + present, + highlights, + ); +} + +const GRAMMAR: &str = r#" +#[display_name = "Test"] +#[path = "**/test.txt"] +pub fn test() { + if /(?i:FOO)\>/ { + yield keyword.control; + } +} +"#; + +#[test] +fn case_insensitive_uppercase_literal_matches_uppercase_input() { + assert_kind_present(GRAMMAR, b"FOO", "keyword.control"); +} + +#[test] +fn case_insensitive_uppercase_literal_matches_lowercase_input() { + assert_kind_present(GRAMMAR, b"foo", "keyword.control"); +} + +#[test] +fn case_insensitive_alternation_with_uppercase_alternatives() { + let src = r#" +#[display_name = "Test"] +#[path = "**/test.txt"] +pub fn test() { + if /(?i:FROM|RUN|CMD)\>/ { + yield keyword.control; + } +} +"#; + assert_kind_present(src, b"FROM", "keyword.control"); + assert_kind_present(src, b"RUN", "keyword.control"); + assert_kind_present(src, b"cmd", "keyword.control"); +} From 3ee5cd32deff32835b13e416ed8677f3a4ec8742 Mon Sep 17 00:00:00 2001 From: lczyk Date: Mon, 4 May 2026 16:10:54 +0100 Subject: [PATCH 2/5] fix(lsh): lowercase PrefixInsensitive needle in emit_literal `inlined_memicmp` requires the `Condition::PrefixInsensitive` needle to be ascii-lowercase: it lowercases the haystack byte and compares it raw against the needle. emit_literal interned the literal verbatim, so any uppercase byte in `(?i:...)` made the prefix check mismatch unconditionally and the `if` branch silently never fired. lowercase the needle when the case-insensitive flag is set and any byte is uppercase. zero-cost otherwise (no allocation when the literal is already lowercase). makes the regression tests added in the previous commit pass. bug latent in upstream: no current grammar happens to use uppercase in `(?i:...)` literals, so nothing visibly broke. dockerfile.lsh in a downstream fork tripped it (every dockerfile instruction is uppercase by convention). --- crates/lsh/src/compiler/regex.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index e2c6cd11f77..e69c9b70f4e 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -697,7 +697,16 @@ impl<'a, 'c> CodeGen<'a, 'c> { if s.is_empty() { return Ok(on_match); } - let s = self.compiler.intern_string(s); + // PrefixInsensitive needle must be ascii-lowercase: the runtime + // lowercases each haystack byte and compares against the needle as-is. + let owned; + let needle: &str = if case_insensitive && s.bytes().any(|b| b.is_ascii_uppercase()) { + owned = s.to_ascii_lowercase(); + &owned + } else { + s + }; + let s = self.compiler.intern_string(needle); let condition = if case_insensitive { Condition::PrefixInsensitive(s) } else { Condition::Prefix(s) }; let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); From 1a188503aa5917245b1899b175f801fb93ad1c15 Mon Sep 17 00:00:00 2001 From: lczyk Date: Mon, 4 May 2026 16:21:13 +0100 Subject: [PATCH 3/5] chore(lsh): drop redundant stdext dev-dependency integration tests in `tests/` already see `[dependencies]` of the crate-under-test; no need to re-list `stdext` under `[dev-dependencies]`. mistakenly added in 8021fee. --- crates/lsh/Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/lsh/Cargo.toml b/crates/lsh/Cargo.toml index 525efda5d71..c079ef797f4 100644 --- a/crates/lsh/Cargo.toml +++ b/crates/lsh/Cargo.toml @@ -9,6 +9,3 @@ rust-version.workspace = true [dependencies] stdext.workspace = true - -[dev-dependencies] -stdext.workspace = true From 486b1fff3e75c959a13216a6ca9d64b880b79e25 Mon Sep 17 00:00:00 2001 From: lczyk Date: Mon, 4 May 2026 16:23:49 +0100 Subject: [PATCH 4/5] test: move `(?i:UPPER)` regression tests inline into regex.rs dropped the `tests/regex_case_insensitive.rs` integration file in favour of a `#[cfg(test)] mod tests` block at the bottom of `regex.rs`. shrinks the PR diff and keeps the test next to the bug site (`emit_literal`). same coverage: single literal + alternation, mixed-case inputs. --- crates/lsh/src/compiler/regex.rs | 58 ++++++++++++ crates/lsh/tests/regex_case_insensitive.rs | 100 --------------------- 2 files changed, 58 insertions(+), 100 deletions(-) delete mode 100644 crates/lsh/tests/regex_case_insensitive.rs diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index e69c9b70f4e..7507b519f3b 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -786,3 +786,61 @@ pub fn parse<'a>( Ok((entry, codegen.captures)) } + +#[cfg(test)] +mod tests { + //! Regression for `(?i:UPPER)` literal silent-mismatch: + //! `inlined_memicmp` requires lowercase needle; pre-fix `emit_literal` + //! interned uppercase verbatim, killing the match unconditionally. + + use stdext::arena::{self, Arena, scratch_arena}; + + use crate::compiler::Compiler; + use crate::runtime::Runtime; + + fn matches(grammar: &str, input: &[u8]) -> bool { + static INIT: std::sync::Once = std::sync::Once::new(); + INIT.call_once(|| arena::init(1 << 20).unwrap()); + + let arena = Arena::new(1 << 20).unwrap(); + let mut compiler = Compiler::new(&arena); + compiler.parse("t.lsh", grammar).unwrap(); + let asm = compiler.assemble().unwrap(); + + let charsets: Vec<[u16; 16]> = asm.charsets.iter().map(|c| c.serialize()).collect(); + let kc = asm + .highlight_kinds + .iter() + .find(|h| h.identifier == "keyword.control") + .unwrap() + .value; + let entry = asm.entrypoints[0].address as u32; + let mut rt = Runtime::new(&asm.instructions, &asm.strings, &charsets, entry); + + let scratch = scratch_arena(None); + rt.parse_next_line::(&scratch, input).iter().any(|h| h.kind == kc) + } + + const SINGLE: &str = r#" +#[display_name = "T"] #[path = "**/t.txt"] +pub fn t() { if /(?i:FOO)\>/ { yield keyword.control; } } +"#; + + const ALT: &str = r#" +#[display_name = "T"] #[path = "**/t.txt"] +pub fn t() { if /(?i:FROM|RUN|CMD)\>/ { yield keyword.control; } } +"#; + + #[test] + fn case_insensitive_uppercase_literal() { + assert!(matches(SINGLE, b"FOO")); + assert!(matches(SINGLE, b"foo")); + } + + #[test] + fn case_insensitive_uppercase_alternation() { + assert!(matches(ALT, b"FROM")); + assert!(matches(ALT, b"RUN")); + assert!(matches(ALT, b"cmd")); + } +} diff --git a/crates/lsh/tests/regex_case_insensitive.rs b/crates/lsh/tests/regex_case_insensitive.rs deleted file mode 100644 index b0025633165..00000000000 --- a/crates/lsh/tests/regex_case_insensitive.rs +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -//! Regression test for the `(?i:...)` literal bug. -//! -//! `inlined_memicmp` (in `runtime.rs`) requires the needle for -//! `Condition::PrefixInsensitive` to be ASCII-lowercase: it lowercases each -//! haystack byte and compares it raw against the needle. Before the fix in -//! `emit_literal`, the literal was interned verbatim, so any uppercase byte -//! made the prefix check mismatch unconditionally and the `if` branch -//! silently never fired. - -use lsh::compiler::Compiler; -use lsh::runtime::Runtime; -use stdext::arena::{self, Arena, scratch_arena}; - -fn ensure_scratch() { - use std::sync::Once; - static ONCE: Once = Once::new(); - ONCE.call_once(|| { - arena::init(1 << 20).unwrap(); - }); -} - -fn compile_and_run(src: &str, line: &[u8]) -> (Vec<(usize, u32)>, Vec) { - ensure_scratch(); - let arena = Arena::new(1 << 20).unwrap(); - let mut compiler = Compiler::new(&arena); - compiler.parse("test.lsh", src).unwrap(); - let assembly = compiler.assemble().unwrap(); - - let charsets: Vec<[u16; 16]> = assembly.charsets.iter().map(|c| c.serialize()).collect(); - let kind_names: Vec = { - let max = assembly.highlight_kinds.iter().map(|hk| hk.value).max().unwrap_or(0); - let mut names = vec![String::new(); max as usize + 1]; - for hk in &assembly.highlight_kinds { - names[hk.value as usize] = hk.identifier.to_string(); - } - names - }; - let entry = assembly.entrypoints[0].address as u32; - let mut runtime = - Runtime::new(&assembly.instructions, &assembly.strings, &charsets, entry); - - let scratch = scratch_arena(None); - let highlights = runtime.parse_next_line::(&scratch, line); - let pairs = highlights.iter().map(|h| (h.start, h.kind)).collect(); - (pairs, kind_names) -} - -fn assert_kind_present(src: &str, line: &[u8], expected_kind: &str) { - let (highlights, kind_names) = compile_and_run(src, line); - let present: Vec<&str> = highlights - .iter() - .filter_map(|(_, k)| kind_names.get(*k as usize).map(String::as_str)) - .collect(); - assert!( - present.contains(&expected_kind), - "expected `{expected_kind}` span on input {:?}, got kinds: {:?} (raw: {:?})", - std::str::from_utf8(line).unwrap_or(""), - present, - highlights, - ); -} - -const GRAMMAR: &str = r#" -#[display_name = "Test"] -#[path = "**/test.txt"] -pub fn test() { - if /(?i:FOO)\>/ { - yield keyword.control; - } -} -"#; - -#[test] -fn case_insensitive_uppercase_literal_matches_uppercase_input() { - assert_kind_present(GRAMMAR, b"FOO", "keyword.control"); -} - -#[test] -fn case_insensitive_uppercase_literal_matches_lowercase_input() { - assert_kind_present(GRAMMAR, b"foo", "keyword.control"); -} - -#[test] -fn case_insensitive_alternation_with_uppercase_alternatives() { - let src = r#" -#[display_name = "Test"] -#[path = "**/test.txt"] -pub fn test() { - if /(?i:FROM|RUN|CMD)\>/ { - yield keyword.control; - } -} -"#; - assert_kind_present(src, b"FROM", "keyword.control"); - assert_kind_present(src, b"RUN", "keyword.control"); - assert_kind_present(src, b"cmd", "keyword.control"); -} From 245347a4fbe8f31b42a24fae6e82f5ced2027c49 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Mon, 4 May 2026 20:15:11 +0200 Subject: [PATCH 5/5] Remove test, Use scratch for lowercasing --- crates/lsh/src/compiler/regex.rs | 74 ++++---------------------------- 1 file changed, 9 insertions(+), 65 deletions(-) diff --git a/crates/lsh/src/compiler/regex.rs b/crates/lsh/src/compiler/regex.rs index 7507b519f3b..0bac7718d4e 100644 --- a/crates/lsh/src/compiler/regex.rs +++ b/crates/lsh/src/compiler/regex.rs @@ -45,6 +45,7 @@ use std::slice; +use stdext::arena::scratch_arena; use stdext::collections::BVec; use super::*; @@ -697,16 +698,17 @@ impl<'a, 'c> CodeGen<'a, 'c> { if s.is_empty() { return Ok(on_match); } - // PrefixInsensitive needle must be ascii-lowercase: the runtime - // lowercases each haystack byte and compares against the needle as-is. - let owned; - let needle: &str = if case_insensitive && s.bytes().any(|b| b.is_ascii_uppercase()) { - owned = s.to_ascii_lowercase(); - &owned + + let scratch = scratch_arena(Some(self.compiler.arena)); + let s = if case_insensitive { + let mut lower = BString::from_str(&*scratch, s); + lower.make_ascii_lowercase(); + lower.leak() } else { s }; - let s = self.compiler.intern_string(needle); + + let s = self.compiler.intern_string(s); let condition = if case_insensitive { Condition::PrefixInsensitive(s) } else { Condition::Prefix(s) }; let if_node = self.compiler.alloc_iri(IRI::If { condition, then: on_match }); @@ -786,61 +788,3 @@ pub fn parse<'a>( Ok((entry, codegen.captures)) } - -#[cfg(test)] -mod tests { - //! Regression for `(?i:UPPER)` literal silent-mismatch: - //! `inlined_memicmp` requires lowercase needle; pre-fix `emit_literal` - //! interned uppercase verbatim, killing the match unconditionally. - - use stdext::arena::{self, Arena, scratch_arena}; - - use crate::compiler::Compiler; - use crate::runtime::Runtime; - - fn matches(grammar: &str, input: &[u8]) -> bool { - static INIT: std::sync::Once = std::sync::Once::new(); - INIT.call_once(|| arena::init(1 << 20).unwrap()); - - let arena = Arena::new(1 << 20).unwrap(); - let mut compiler = Compiler::new(&arena); - compiler.parse("t.lsh", grammar).unwrap(); - let asm = compiler.assemble().unwrap(); - - let charsets: Vec<[u16; 16]> = asm.charsets.iter().map(|c| c.serialize()).collect(); - let kc = asm - .highlight_kinds - .iter() - .find(|h| h.identifier == "keyword.control") - .unwrap() - .value; - let entry = asm.entrypoints[0].address as u32; - let mut rt = Runtime::new(&asm.instructions, &asm.strings, &charsets, entry); - - let scratch = scratch_arena(None); - rt.parse_next_line::(&scratch, input).iter().any(|h| h.kind == kc) - } - - const SINGLE: &str = r#" -#[display_name = "T"] #[path = "**/t.txt"] -pub fn t() { if /(?i:FOO)\>/ { yield keyword.control; } } -"#; - - const ALT: &str = r#" -#[display_name = "T"] #[path = "**/t.txt"] -pub fn t() { if /(?i:FROM|RUN|CMD)\>/ { yield keyword.control; } } -"#; - - #[test] - fn case_insensitive_uppercase_literal() { - assert!(matches(SINGLE, b"FOO")); - assert!(matches(SINGLE, b"foo")); - } - - #[test] - fn case_insensitive_uppercase_alternation() { - assert!(matches(ALT, b"FROM")); - assert!(matches(ALT, b"RUN")); - assert!(matches(ALT, b"cmd")); - } -}