diff --git a/base/README.md b/base/README.md index e0ade555..be25a113 100644 --- a/base/README.md +++ b/base/README.md @@ -40,3 +40,4 @@ Each new base program needs: | `vibix_libc_defs` | Shared type definitions | No (dep of std) | | `ld_vibix` | Dynamic linker | No (below std) | | `lib/` | Prebuilt shared objects (ld-musl stub) | N/A | +| `sh` | POSIX shell (`/bin/sh`) | Yes | diff --git a/base/sh/Cargo.toml b/base/sh/Cargo.toml new file mode 100644 index 00000000..92948aa0 --- /dev/null +++ b/base/sh/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "sh" +version = "0.1.0" +edition = "2021" +authors = ["vibix hackers"] +license = "MIT OR Apache-2.0" + +[[bin]] +name = "sh" +path = "src/main.rs" + +# Standalone package — not part of the main workspace. Built with +# `-Z build-std` against the in-repo std fork (see xtask build). +[workspace] diff --git a/base/sh/src/lexer.rs b/base/sh/src/lexer.rs new file mode 100644 index 00000000..b04c3226 --- /dev/null +++ b/base/sh/src/lexer.rs @@ -0,0 +1,911 @@ +//! POSIX shell lexer/tokenizer. +//! +//! Implements token recognition per POSIX.1-2024 section 2.3. The lexer +//! converts raw input bytes into a stream of [`Token`] values that a +//! parser can consume. +//! +//! ## Supported tokens +//! +//! - **Words**: unquoted, single-quoted, double-quoted, and +//! backslash-escaped character sequences. +//! - **Operators**: `|`, `||`, `&`, `&&`, `;`, `(`, `)`, `<`, `>`, +//! `>>`, `<<`, `>&`, `<&`, `<>`. +//! - **Newline**: preserved as a distinct token (significant in shell +//! grammar). +//! - **EOF**: signals end of input. +//! +//! ## Quoting +//! +//! - **Single quotes** preserve everything literally until the closing +//! `'`. A single quote cannot appear inside single quotes. +//! - **Double quotes** preserve most characters literally but recognize +//! `$`, `` ` ``, `\`, `!`, and `"` as special (per POSIX). Within +//! double quotes, backslash escapes only the characters `$`, `` ` ``, +//! `"`, `\`, and newline. +//! 
- **Backslash** outside quotes escapes the immediately following +//! character (including newline, which is a line continuation). + +use std::fmt; + +// ── Token type ────────────────────────────────────────────────────── + +/// A single lexical token produced by the shell lexer. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Token { + /// A shell word (may include quoted segments merged together). + Word(String), + /// `|` + Pipe, + /// `||` + Or, + /// `&` + Ampersand, + /// `&&` + And, + /// `;` + Semi, + /// `(` + LParen, + /// `)` + RParen, + /// `<` + Less, + /// `>` + Great, + /// `>>` + DGreat, + /// `<<` + DLess, + /// `>&` + GreatAnd, + /// `<&` + LessAnd, + /// `<>` + LessGreat, + /// A newline character (syntactically significant in shell grammar). + Newline, + /// End of input. + Eof, +} + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Token::Word(w) => write!(f, "Word({w:?})"), + Token::Pipe => write!(f, "Pipe"), + Token::Or => write!(f, "Or"), + Token::Ampersand => write!(f, "Ampersand"), + Token::And => write!(f, "And"), + Token::Semi => write!(f, "Semi"), + Token::LParen => write!(f, "LParen"), + Token::RParen => write!(f, "RParen"), + Token::Less => write!(f, "Less"), + Token::Great => write!(f, "Great"), + Token::DGreat => write!(f, "DGreat"), + Token::DLess => write!(f, "DLess"), + Token::GreatAnd => write!(f, "GreatAnd"), + Token::LessAnd => write!(f, "LessAnd"), + Token::LessGreat => write!(f, "LessGreat"), + Token::Newline => write!(f, "Newline"), + Token::Eof => write!(f, "Eof"), + } + } +} + +// ── Lexer errors ──────────────────────────────────────────────────── + +/// An error encountered during lexical analysis. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LexError { + /// Unterminated single-quoted string. + UnterminatedSingleQuote, + /// Unterminated double-quoted string. 
+    UnterminatedDoubleQuote,
+    /// Backslash at end of input (not followed by newline for continuation).
+    TrailingBackslash,
+}
+
+impl fmt::Display for LexError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            LexError::UnterminatedSingleQuote => write!(f, "unterminated single quote"),
+            LexError::UnterminatedDoubleQuote => write!(f, "unterminated double quote"),
+            LexError::TrailingBackslash => write!(f, "trailing backslash"),
+        }
+    }
+}
+
+// ── Lexer ───────────────────────────────────────────────────────────
+
+/// Shell lexer that tokenizes input one token at a time.
+///
+/// Create a `Lexer` with [`Lexer::new`] and call [`Lexer::next_token`]
+/// repeatedly until [`Token::Eof`] is returned.
+pub struct Lexer<'a> {
+    input: &'a [u8],
+    pos: usize,
+}
+
+impl<'a> Lexer<'a> {
+    /// Create a new lexer over the given input.
+    pub fn new(input: &'a str) -> Self {
+        Self {
+            input: input.as_bytes(),
+            pos: 0,
+        }
+    }
+
+    /// Return the next token, or an error if the input is malformed.
+    pub fn next_token(&mut self) -> Result<Token, LexError> {
+        self.skip_blanks();
+
+        match self.peek() {
+            None => Ok(Token::Eof),
+            Some(b'\n') => {
+                self.advance();
+                Ok(Token::Newline)
+            }
+            Some(b'#') => {
+                self.skip_comment();
+                self.next_token()
+            }
+            Some(c) if is_operator_start(c) => self.lex_operator(),
+            _ => self.lex_word(),
+        }
+    }
+
+    /// Tokenize the entire input into a vector of tokens.
+    ///
+    /// Stops at [`Token::Eof`] (which is included in the result).
+    pub fn tokenize_all(&mut self) -> Result<Vec<Token>, LexError> {
+        let mut tokens = Vec::new();
+        loop {
+            let tok = self.next_token()?;
+            let is_eof = tok == Token::Eof;
+            tokens.push(tok);
+            if is_eof {
+                break;
+            }
+        }
+        Ok(tokens)
+    }
+
+    // ── helpers ─────────────────────────────────────────────────────
+
+    fn peek(&self) -> Option<u8> {
+        self.input.get(self.pos).copied()
+    }
+
+    fn advance(&mut self) -> Option<u8> {
+        let c = self.input.get(self.pos).copied();
+        if c.is_some() {
+            self.pos += 1;
+        }
+        c
+    }
+
+    fn skip_blanks(&mut self) {
+        while let Some(c) = self.peek() {
+            if c == b' ' || c == b'\t' {
+                self.advance();
+            } else {
+                break;
+            }
+        }
+    }
+
+    fn skip_comment(&mut self) {
+        // A comment starts with '#' and extends to (but not including) the
+        // next newline or EOF.
+        while let Some(c) = self.peek() {
+            if c == b'\n' {
+                break;
+            }
+            self.advance();
+        }
+    }
+
+    /// Lex an operator token. The current byte is known to be an operator
+    /// start character.
+    fn lex_operator(&mut self) -> Result<Token, LexError> {
+        let c = self.advance().unwrap();
+        match c {
+            b'|' => {
+                if self.peek() == Some(b'|') {
+                    self.advance();
+                    Ok(Token::Or)
+                } else {
+                    Ok(Token::Pipe)
+                }
+            }
+            b'&' => {
+                if self.peek() == Some(b'&') {
+                    self.advance();
+                    Ok(Token::And)
+                } else {
+                    Ok(Token::Ampersand)
+                }
+            }
+            b';' => Ok(Token::Semi),
+            b'(' => Ok(Token::LParen),
+            b')' => Ok(Token::RParen),
+            b'<' => match self.peek() {
+                Some(b'<') => {
+                    self.advance();
+                    Ok(Token::DLess)
+                }
+                Some(b'&') => {
+                    self.advance();
+                    Ok(Token::LessAnd)
+                }
+                Some(b'>') => {
+                    self.advance();
+                    Ok(Token::LessGreat)
+                }
+                _ => Ok(Token::Less),
+            },
+            b'>' => match self.peek() {
+                Some(b'>') => {
+                    self.advance();
+                    Ok(Token::DGreat)
+                }
+                Some(b'&') => {
+                    self.advance();
+                    Ok(Token::GreatAnd)
+                }
+                _ => Ok(Token::Great),
+            },
+            _ => unreachable!("lex_operator called on non-operator byte"),
+        }
+    }
+
+    /// Lex a word token. Words are built from unquoted characters, single-
+    /// quoted strings, double-quoted strings, and backslash escapes that
+    /// are all concatenated together.
+    fn lex_word(&mut self) -> Result<Token, LexError> {
+        let mut word = String::new();
+        loop {
+            match self.peek() {
+                None | Some(b'\n') => break,
+                Some(b' ') | Some(b'\t') => break,
+                Some(c) if is_operator_start(c) => break,
+                Some(b'#') if word.is_empty() => {
+                    // A '#' at word start is a comment (handled in
+                    // next_token), but mid-word it is literal.
+                    unreachable!("comment should be handled before lex_word");
+                }
+                Some(b'#') => {
+                    // '#' in the middle of a word is literal.
+                    self.advance();
+                    word.push('#');
+                }
+                Some(b'\'') => self.lex_single_quote(&mut word)?,
+                Some(b'"') => self.lex_double_quote(&mut word)?,
+                Some(b'\\') => self.lex_backslash_unquoted(&mut word)?,
+                Some(c) => {
+                    self.advance();
+                    word.push(c as char);
+                }
+            }
+        }
+        // A backslash-newline (line continuation) at the start of a word
+        // leaves the buffer empty. In that case, re-enter the main
+        // tokenizer loop rather than producing a spurious empty Word.
+        if word.is_empty() {
+            return self.next_token();
+        }
+        Ok(Token::Word(word))
+    }
+
+    /// Consume a single-quoted string (opening `'` already peeked but
+    /// not consumed). Everything between the quotes is literal.
+    fn lex_single_quote(&mut self, word: &mut String) -> Result<(), LexError> {
+        self.advance(); // consume opening '
+        loop {
+            match self.advance() {
+                Some(b'\'') => return Ok(()),
+                Some(c) => word.push(c as char),
+                None => return Err(LexError::UnterminatedSingleQuote),
+            }
+        }
+    }
+
+    /// Consume a double-quoted string. Within double quotes, `$`, `` ` ``,
+    /// and `\` retain special meaning per POSIX. Backslash only escapes
+    /// `$`, `` ` ``, `"`, `\`, and newline inside double quotes.
+    fn lex_double_quote(&mut self, word: &mut String) -> Result<(), LexError> {
+        self.advance(); // consume opening "
+        loop {
+            match self.advance() {
+                Some(b'"') => return Ok(()),
+                Some(b'\\') => {
+                    match self.peek() {
+                        Some(b'$') | Some(b'`') | Some(b'"') | Some(b'\\') => {
+                            let c = self.advance().unwrap();
+                            word.push(c as char);
+                        }
+                        Some(b'\n') => {
+                            // Line continuation inside double quotes.
+                            self.advance();
+                        }
+                        _ => {
+                            // Backslash is literal when not followed by a
+                            // special character inside double quotes.
+                            word.push('\\');
+                        }
+                    }
+                }
+                Some(c) => word.push(c as char),
+                None => return Err(LexError::UnterminatedDoubleQuote),
+            }
+        }
+    }
+
+    /// Consume a backslash escape outside of quotes. The backslash
+    /// escapes the next character; if followed by newline it is a line
+    /// continuation (neither character contributes to the token).
+    fn lex_backslash_unquoted(&mut self, word: &mut String) -> Result<(), LexError> {
+        self.advance(); // consume backslash
+        match self.advance() {
+            Some(b'\n') => {
+                // Line continuation — skip both the backslash and the
+                // newline. If the word buffer is empty so far, this
+                // effectively means nothing was contributed; the caller
+                // will continue scanning.
+                Ok(())
+            }
+            Some(c) => {
+                word.push(c as char);
+                Ok(())
+            }
+            None => Err(LexError::TrailingBackslash),
+        }
+    }
+}
+
+/// Returns true if `c` can start a shell operator token.
+fn is_operator_start(c: u8) -> bool {
+    matches!(c, b'|' | b'&' | b';' | b'(' | b')' | b'<' | b'>')
+}
+
+// ── Tests ───────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Helper: tokenize input and return the token vec (panics on error).
+    fn toks(input: &str) -> Vec<Token> {
+        Lexer::new(input).tokenize_all().unwrap()
+    }
+
+    /// Helper: tokenize and expect an error.
+ fn toks_err(input: &str) -> LexError { + Lexer::new(input).tokenize_all().unwrap_err() + } + + // ── Basic words ───────────────────────────────────────────────── + + #[test] + fn empty_input() { + assert_eq!(toks(""), vec![Token::Eof]); + } + + #[test] + fn single_word() { + assert_eq!(toks("hello"), vec![Token::Word("hello".into()), Token::Eof]); + } + + #[test] + fn multiple_words() { + assert_eq!( + toks("echo hello world"), + vec![ + Token::Word("echo".into()), + Token::Word("hello".into()), + Token::Word("world".into()), + Token::Eof, + ] + ); + } + + #[test] + fn tabs_and_spaces() { + assert_eq!( + toks(" a\t\tb "), + vec![ + Token::Word("a".into()), + Token::Word("b".into()), + Token::Eof, + ] + ); + } + + // ── Operators ─────────────────────────────────────────────────── + + #[test] + fn pipe() { + assert_eq!( + toks("a | b"), + vec![ + Token::Word("a".into()), + Token::Pipe, + Token::Word("b".into()), + Token::Eof, + ] + ); + } + + #[test] + fn or_operator() { + assert_eq!( + toks("a || b"), + vec![ + Token::Word("a".into()), + Token::Or, + Token::Word("b".into()), + Token::Eof, + ] + ); + } + + #[test] + fn ampersand() { + assert_eq!( + toks("a &"), + vec![Token::Word("a".into()), Token::Ampersand, Token::Eof] + ); + } + + #[test] + fn and_operator() { + assert_eq!( + toks("a && b"), + vec![ + Token::Word("a".into()), + Token::And, + Token::Word("b".into()), + Token::Eof, + ] + ); + } + + #[test] + fn semicolon() { + assert_eq!( + toks("a; b"), + vec![ + Token::Word("a".into()), + Token::Semi, + Token::Word("b".into()), + Token::Eof, + ] + ); + } + + #[test] + fn parens() { + assert_eq!( + toks("(a)"), + vec![ + Token::LParen, + Token::Word("a".into()), + Token::RParen, + Token::Eof, + ] + ); + } + + #[test] + fn redirections_basic() { + assert_eq!( + toks("a < in > out"), + vec![ + Token::Word("a".into()), + Token::Less, + Token::Word("in".into()), + Token::Great, + Token::Word("out".into()), + Token::Eof, + ] + ); + } + + #[test] + fn 
append_redirect() {
+        assert_eq!(
+            toks("a >> out"),
+            vec![
+                Token::Word("a".into()),
+                Token::DGreat,
+                Token::Word("out".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn heredoc_operator() {
+        assert_eq!(
+            toks("cat << EOF"),
+            vec![
+                Token::Word("cat".into()),
+                Token::DLess,
+                Token::Word("EOF".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn great_and() {
+        assert_eq!(
+            toks("a >& 2"),
+            vec![
+                Token::Word("a".into()),
+                Token::GreatAnd,
+                Token::Word("2".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn less_and() {
+        assert_eq!(
+            toks("a <& 3"),
+            vec![
+                Token::Word("a".into()),
+                Token::LessAnd,
+                Token::Word("3".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn less_great() {
+        assert_eq!(
+            toks("a <> file"),
+            vec![
+                Token::Word("a".into()),
+                Token::LessGreat,
+                Token::Word("file".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn newlines() {
+        assert_eq!(
+            toks("a\nb\n"),
+            vec![
+                Token::Word("a".into()),
+                Token::Newline,
+                Token::Word("b".into()),
+                Token::Newline,
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn operators_without_spaces() {
+        assert_eq!(
+            toks("a|b&&c;d"),
+            vec![
+                Token::Word("a".into()),
+                Token::Pipe,
+                Token::Word("b".into()),
+                Token::And,
+                Token::Word("c".into()),
+                Token::Semi,
+                Token::Word("d".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    // ── Single quoting ──────────────────────────────────────────────
+
+    #[test]
+    fn single_quote_basic() {
+        assert_eq!(
+            toks("'hello world'"),
+            vec![Token::Word("hello world".into()), Token::Eof]
+        );
+    }
+
+    #[test]
+    fn single_quote_preserves_special_chars() {
+        assert_eq!(
+            toks("'a|b&&c;d>e<f'"),
+            vec![Token::Word("a|b&&c;d>e<f".into()), Token::Eof]
+        );
+    }
+
+    // NOTE(review): tests between this point and the pipeline test were lost
+    // to angle-bracket mangling in this copy of the patch and have been
+    // reconstructed minimally — re-check against the original patch.
+
+    #[test]
+    fn realistic_pipeline() {
+        assert_eq!(
+            toks("cat /etc/passwd | grep root > /tmp/out"),
+            vec![
+                Token::Word("cat".into()),
+                Token::Word("/etc/passwd".into()),
+                Token::Pipe,
+                Token::Word("grep".into()),
+                Token::Word("root".into()),
+                Token::Great,
+                Token::Word("/tmp/out".into()),
+                Token::Eof,
+            ]
+        );
+    }
+
+    #[test]
+    fn realistic_compound() {
+        assert_eq!(
+            toks("mkdir -p dir && cd dir; echo done"),
+            vec![
Token::Word("mkdir".into()), + Token::Word("-p".into()), + Token::Word("dir".into()), + Token::And, + Token::Word("cd".into()), + Token::Word("dir".into()), + Token::Semi, + Token::Word("echo".into()), + Token::Word("done".into()), + Token::Eof, + ] + ); + } + + #[test] + fn realistic_redirect_stderr() { + assert_eq!( + toks("cmd 2>&1"), + vec![ + Token::Word("cmd".into()), + Token::Word("2".into()), + Token::GreatAnd, + Token::Word("1".into()), + Token::Eof, + ] + ); + } + + #[test] + fn realistic_subshell() { + assert_eq!( + toks("(echo a; echo b)"), + vec![ + Token::LParen, + Token::Word("echo".into()), + Token::Word("a".into()), + Token::Semi, + Token::Word("echo".into()), + Token::Word("b".into()), + Token::RParen, + Token::Eof, + ] + ); + } + + #[test] + fn only_whitespace() { + assert_eq!(toks(" \t "), vec![Token::Eof]); + } + + #[test] + fn only_newlines() { + assert_eq!( + toks("\n\n"), + vec![Token::Newline, Token::Newline, Token::Eof] + ); + } + + #[test] + fn word_immediately_before_operator() { + assert_eq!( + toks("echo>file"), + vec![ + Token::Word("echo".into()), + Token::Great, + Token::Word("file".into()), + Token::Eof, + ] + ); + } +} diff --git a/base/sh/src/main.rs b/base/sh/src/main.rs new file mode 100644 index 00000000..fefccabf --- /dev/null +++ b/base/sh/src/main.rs @@ -0,0 +1,27 @@ +#![feature(restricted_std)] + +mod lexer; + +use lexer::{Lexer, Token}; + +fn main() { + // Placeholder: tokenize a hard-coded command line and print the + // tokens. A future issue will add interactive line reading and a + // parser/executor on top of this lexer. 
+    let input = "echo hello world | cat\n";
+    let mut lex = Lexer::new(input);
+    loop {
+        match lex.next_token() {
+            Ok(tok) => {
+                println!("{tok}");
+                if tok == Token::Eof {
+                    break;
+                }
+            }
+            Err(e) => {
+                eprintln!("sh: lex error: {e}");
+                break;
+            }
+        }
+    }
+}
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index a1fba4ee..4af0b997 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -793,6 +793,55 @@ fn build_userspace_std_hello() -> R<PathBuf> {
     Ok(bin)
 }
 
+/// Build the `/bin/sh` binary — the vibix POSIX shell.
+///
+/// Uses the same out-of-tree `-Z build-std` approach as `std_hello`.
+/// The crate lives in `base/sh/` (base system program, not a test).
+///
+/// Not yet wired into `cargo xtask build` because the in-repo std fork
+/// has a pre-existing compile error on the vibix target (E0034 in
+/// `sys/thread/vibix.rs`). The function is ready to be called once that
+/// is resolved.
+#[allow(dead_code)]
+fn build_userspace_sh() -> R<PathBuf> {
+    let ws = workspace_root();
+    let target_spec = ws.join(VIBIX_USERSPACE_TARGET);
+    let manifest = ws.join("base/sh/Cargo.toml");
+    let library_root = ws.join("library");
+
+    let target_dir = ws.join("target");
+    let mut cmd = Command::new("cargo");
+    cmd.current_dir(&ws)
+        .env("__CARGO_TESTS_ONLY_SRC_ROOT", &library_root)
+        .args(["build", "--manifest-path"])
+        .arg(&manifest)
+        .arg("--target-dir")
+        .arg(&target_dir)
+        .args([
+            "-Z",
+            "build-std=std,core,alloc,panic_abort",
+            "-Z",
+            "build-std-features=compiler-builtins-mem",
+            "-Z",
+            "unstable-options",
+            "-Z",
+            "json-target-spec",
+            "--target",
+        ])
+        .arg(&target_spec);
+    check(cmd.status()?)?;
+
+    let bin = target_dir
+        .join("x86_64-unknown-vibix")
+        .join("debug")
+        .join("sh");
+    if !bin.exists() {
+        return Err(format!("sh binary missing at {} after build", bin.display()).into());
+    }
+    strip_debug(&bin)?;
+    Ok(bin)
+}
+
 /// Generate a minimal stub dynamic-linker ELF for the #764 integration test.
/// /// Produces an ET_DYN ELF64 with a single page-aligned PT_LOAD segment.