use smartstring::{LazyCompact, SmartString};
use std::borrow::Cow;

/// A utility for parsing shell-like command lines.
///
/// The `Shellwords` struct takes an input string and allows extracting the command and its arguments.
///
/// # Features
///
/// - Parses command and arguments from input strings.
/// - Supports single, double, and backtick quoted arguments.
/// - Respects backslash escaping in arguments.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// # use helix_core::shellwords::Shellwords;
/// let shellwords = Shellwords::from(":o helix-core/src/shellwords.rs");
/// assert_eq!(":o", shellwords.command());
/// assert_eq!("helix-core/src/shellwords.rs", shellwords.args().next().unwrap());
/// ```
///
/// Empty command:
///
/// ```
/// # use helix_core::shellwords::Shellwords;
/// let shellwords = Shellwords::from(" ");
/// assert!(shellwords.command().is_empty());
/// ```
///
/// # Iterator
///
/// The `args` method returns a non-allocating iterator, `Args`, over the arguments of the input.
///
/// ```
/// # use helix_core::shellwords::Shellwords;
/// let shellwords = Shellwords::from(":o a b c");
/// let mut args = shellwords.args();
/// assert_eq!(Some("a"), args.next());
/// assert_eq!(Some("b"), args.next());
/// assert_eq!(Some("c"), args.next());
/// assert_eq!(None, args.next());
/// ```
#[derive(Clone, Copy)]
pub struct Shellwords<'a> {
    input: &'a str,
}

impl<'a> From<&'a str> for Shellwords<'a> {
    #[inline]
    fn from(input: &'a str) -> Self {
        Self { input }
    }
}

impl<'a> From<&'a String> for Shellwords<'a> {
    #[inline]
    fn from(input: &'a String) -> Self {
        Self { input }
    }
}

impl<'a> From<&'a Cow<'a, str>> for Shellwords<'a> {
    #[inline]
    fn from(input: &'a Cow<'a, str>) -> Self {
        Self { input }
    }
}

impl<'a> Shellwords<'a> {
    /// Returns everything before the first space of the input, or the whole input when it
    /// contains no space.
    ///
    /// The returned slice borrows from the original input, not from `self`.
    #[inline]
    #[must_use]
    pub fn command(&self) -> &'a str {
        self.input
            .split_once(' ')
            .map_or(self.input, |(command, _)| command)
    }

    /// Returns an iterator over everything after the first space of the input.
    ///
    /// When the input contains no space the iterator is empty.
    #[inline]
    #[must_use]
    pub fn args(&self) -> Args<'a> {
        let args = self.input.split_once(' ').map_or("", |(_, args)| args);
        Args::parse(args)
    }

    /// Returns the full, unmodified input.
    #[inline]
    pub fn input(&self) -> &'a str {
        self.input
    }

    /// Checks whether the input ends with a space character.
    ///
    /// The check is purely textual: a trailing space counts even when it is preceded by a
    /// backslash, and only the space character (not a tab) is considered.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use helix_core::shellwords::Shellwords;
    /// assert_eq!(Shellwords::from(" ").ends_with_whitespace(), true);
    /// assert_eq!(Shellwords::from(":open ").ends_with_whitespace(), true);
    /// assert_eq!(Shellwords::from(":open foo.txt ").ends_with_whitespace(), true);
    /// assert_eq!(Shellwords::from(":open").ends_with_whitespace(), false);
    /// assert_eq!(Shellwords::from(":open a\\ ").ends_with_whitespace(), true);
    /// assert_eq!(Shellwords::from(":open a\\ b.txt").ends_with_whitespace(), false);
    /// ```
    #[inline]
    pub fn ends_with_whitespace(&self) -> bool {
        self.input.ends_with(' ')
    }
}

/// An iterator over an input string which yields arguments.
///
/// Splits on whitespace, but respects quoted substrings (using double quotes, single quotes, or backticks).
#[derive(Debug, Clone)] pub struct Args<'a> { input: &'a str, idx: usize, start: usize, } impl<'a> Args<'a> { #[inline] fn parse(input: &'a str) -> Self { Self { input, idx: 0, start: 0, } } #[inline] pub fn is_empty(&self) -> bool { self.input.is_empty() } /// Returns the args exactly as input. /// /// # Examples /// ``` /// # use helix_core::shellwords::Args; /// let args = Args::from(r#"sed -n "s/test t/not /p""#); /// assert_eq!(r#"sed -n "s/test t/not /p""#, args.raw()); /// /// let args = Args::from(r#"cat "file name with space.txt""#); /// assert_eq!(r#"cat "file name with space.txt""#, args.raw()); /// ``` #[inline] pub fn raw(&self) -> &str { self.input } /// Returns the remainder of the args exactly as input. /// /// # Examples /// ``` /// # use helix_core::shellwords::Args; /// let mut args = Args::from(r#"sed -n "s/test t/not /p""#); /// assert_eq!("sed", args.next().unwrap()); /// assert_eq!(r#"-n "s/test t/not /p""#, args.rest()); /// ``` /// /// Never calling `next` and using `rest` is functionally equivalent to calling `raw`. #[inline] pub fn rest(&self) -> &str { &self.input[self.idx..] } /// Returns a reference to the `next()` value without advancing the iterator. /// /// Unlike `std::iter::Peakable::peek` this does not return a double reference, `&&str` /// but a normal `&str`. #[inline] #[must_use] pub fn peek(&self) -> Option<&str> { self.clone().next() } /// Returns the total number of arguments given in a command. /// /// This count is aware of all parsing rules for `Args`. #[must_use] pub fn arg_count(&self) -> usize { Self { input: self.input, idx: 0, start: 0, } .fold(0, |acc, _| acc + 1) } /// Convenient function to return an empty `Args`. /// /// When used in any iteration, it will always return `None`. 
#[inline(always)] pub const fn empty() -> Self { Self { input: "", idx: 0, start: 0, } } } impl<'a> Iterator for Args<'a> { type Item = &'a str; #[inline] #[allow(clippy::too_many_lines)] fn next(&mut self) -> Option { // The parser loop is split into three main blocks to handle different types of input processing: // // 1. Quote block: // - Detects an unescaped quote character, either starting an in-quote scan or, if already in-quote, // locating the closing quote to return the quoted argument. // - Handles cases where mismatched quotes are ignored and when quotes appear as the last character. // // 2. Whitespace block: // - Handles arguments separated by whitespace (space or tab), respecting quotes so quoted phrases // remain grouped together. // - Splits arguments by whitespace when outside of a quoted context and updates boundaries accordingly. // // 3. Catch-all block: // - Handles any other character, updating the `is_escaped` status if a backslash is encountered, // advancing the loop to the next character. let bytes = self.input.as_bytes(); let mut in_quotes = false; let mut quote = b'\0'; let mut is_escaped = false; while self.idx < bytes.len() { match bytes[self.idx] { b'"' | b'\'' | b'`' if !is_escaped => { if in_quotes { // Found the proper closing quote, so can return the arg and advance the state along. if bytes[self.idx] == quote { let arg = Some(&self.input[self.start..self.idx]); self.idx += 1; self.start = self.idx; return arg; } // If quote does not match the type of the opening quote, then do nothing and advance. self.idx += 1; } else if self.idx == bytes.len() - 1 { // Special case for when a quote is the last input in args. // e.g: :read "file with space.txt"" // This preserves the quote as an arg: // - `file with space` // - `"` let arg = Some(&self.input[self.idx..]); self.idx = bytes.len(); self.start = bytes.len(); return arg; } else { // Found opening quote. in_quotes = true; // Kind of quote that was found. 
quote = bytes[self.idx]; if self.start < self.idx { // When part of the input ends in a quote, `one two" three`, this properly returns the `two` // before advancing to the quoted arg for the next iteration: // - `one` <- previous arg // - `two` <- this step // - ` three` <- next arg let arg = Some(&self.input[self.start..self.idx]); self.idx += 1; self.start = self.idx; return arg; } // Advance after quote. self.idx += 1; // Exclude quote from arg output. self.start = self.idx; } } b' ' | b'\t' if !in_quotes => { // Found a true whitespace separator that wasn't inside quotes. // Check if there is anything to return or if its just advancing over whitespace. // `start` will only be less than `idx` when there is something to return. if self.start < self.idx { let arg = Some(&self.input[self.start..self.idx]); self.idx += 1; self.start = self.idx; return arg; } // Advance beyond the whitespace. self.idx += 1; // This is where `start` will be set to the start of an arg boundary, either encountering a word // boundary or a quote boundary. If it finds a quote, then it will be advanced again in that part // of the code. Either way, all that remains for the check above will be to return a full arg. self.start = self.idx; } _ => { // If previous loop didn't find any backslash and was already escaped it will change to false // as the backslash chain was broken. // // If the previous loop had no backslash escape, and found one this iteration, then its the start // of an escape chain. is_escaped = match (is_escaped, bytes[self.idx]) { (false, b'\\') => true, // Set `is_escaped` if the current byte is a backslash _ => false, //Reset `is_escaped` if it was true, otherwise keep `is_escaped` as false }; // Advance to next `char`. self.idx += 1; } } } // Fallback that catches when the loop would have exited but failed to return the arg between start and the end. 
if self.start < bytes.len() { let arg = Some(&self.input[self.start..]); self.start = bytes.len(); return arg; } // All args have been parsed. None } fn count(self) -> usize where Self: Sized, { panic!("use `arg_count` instead to get the number of arguments."); } } impl<'a> From<&'a String> for Args<'a> { fn from(args: &'a String) -> Self { Args::parse(args) } } impl<'a> From<&'a str> for Args<'a> { fn from(args: &'a str) -> Self { Args::parse(args) } } impl<'a> From<&'a Cow<'_, str>> for Args<'a> { fn from(args: &'a Cow) -> Self { Args::parse(args) } } /// Auto escape for shellwords usage. #[inline] #[must_use] pub fn escape(input: Cow) -> Cow { if !input.chars().any(|x| x.is_ascii_whitespace()) { input } else if cfg!(unix) { Cow::Owned(input.chars().fold(String::new(), |mut buf, c| { if c.is_ascii_whitespace() { buf.push('\\'); } buf.push(c); buf })) } else { Cow::Owned(format!("\"{input}\"")) } } /// Unescapes a string, converting escape sequences into their literal characters. /// /// This function handles the following escape sequences: /// - `\\n` is converted to `\n` (newline) /// - `\\t` is converted to `\t` (tab) /// - `\\u{...}` is converted to the corresponding Unicode character /// /// Other escape sequences, such as `\\` followed by any character not listed above, will remain unchanged. /// /// If input is invalid, for example if there is invalid unicode, \u{999999999}, it will return the input as is. 
/// /// # Examples /// /// Basic usage: /// /// ``` /// # use helix_core::shellwords::unescape; /// let unescaped = unescape("hello\\nworld"); /// assert_eq!("hello\nworld", unescaped); /// ``` /// /// Unescaping tabs: /// /// ``` /// # use helix_core::shellwords::unescape; /// let unescaped = unescape("hello\\tworld"); /// assert_eq!("hello\tworld", unescaped); /// ``` /// /// Unescaping Unicode characters: /// /// ``` /// # use helix_core::shellwords::unescape; /// let unescaped = unescape("hello\\u{1f929}world"); /// assert_eq!("hello\u{1f929}world", unescaped); /// assert_eq!("hello🤩world", unescaped); /// ``` /// /// Handling backslashes: /// /// ``` /// # use helix_core::shellwords::unescape; /// let unescaped = unescape(r"hello\\world"); /// assert_eq!(r"hello\\world", unescaped); /// /// let unescaped = unescape(r"hello\\\\world"); /// assert_eq!(r"hello\\\\world", unescaped); /// ``` /// /// # Note /// /// This function is opinionated, with a clear purpose of handling user input, not a general or generic unescaping utility, and does not unescape sequences like `\\'` or `\\\"`, leaving them as is. #[inline] #[must_use] pub fn unescape(input: &str) -> Cow<'_, str> { enum State { Normal, Escaped, Unicode, } let mut unescaped = String::new(); let mut state = State::Normal; let mut is_escaped = false; // NOTE: Max unicode code point is U+10FFFF for a maximum of 6 chars let mut unicode = SmartString::::new_const(); for (idx, ch) in input.char_indices() { match state { State::Normal => match ch { '\\' => { if !is_escaped { // PERF: As not every separator will be escaped, we use `String::new` as that has no initial // allocation. If an escape is found, then we reserve capacity thats the len of the separator, // as the new unescaped string will be at least that long. unescaped.reserve(input.len()); if idx > 0 { // First time finding an escape, so all prior chars can be added to the new unescaped // version if its not the very first char found. 
unescaped.push_str(&input[0..idx]); } } state = State::Escaped; is_escaped = true; } _ => { if is_escaped { unescaped.push(ch); } } }, State::Escaped => { match ch { 'n' => unescaped.push('\n'), 't' => unescaped.push('\t'), 'u' => { state = State::Unicode; continue; } // Uncomment if you want to handle '\\' to '\' // '\\' => unescaped.push('\\'), _ => { unescaped.push('\\'); unescaped.push(ch); } } state = State::Normal; } State::Unicode => match ch { '{' => continue, '}' => { let Ok(digit) = u32::from_str_radix(&unicode, 16) else { return input.into(); }; let Some(point) = char::from_u32(digit) else { return input.into(); }; unescaped.push(point); // Might be more unicode to unescape so clear for reuse. unicode.clear(); state = State::Normal; } _ => unicode.push(ch), }, } } if is_escaped { unescaped.into() } else { input.into() } } #[cfg(test)] mod test { use super::*; #[test] fn base() { let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#; let shellwords = Shellwords::from(input); let args = vec![ "single_word", "twó", "wörds", r"\three\", r#"\"with\"#, r"escaping\\", ]; assert_eq!(":o", shellwords.command()); assert_eq!(args, shellwords.args().collect::>()); } #[test] fn should_have_empty_args() { let shellwords = Shellwords::from(":quit"); assert!( shellwords.args().is_empty(), "args: `{}`", shellwords.args().next().unwrap() ); assert!(shellwords.args().next().is_none()); } #[test] fn should_return_empty_command() { let shellwords = Shellwords::from(" "); assert!(shellwords.command().is_empty()); } #[test] fn should_support_unicode_args() { assert_eq!( Shellwords::from(":sh echo 𒀀").args().collect::>(), &["echo", "𒀀"] ); assert_eq!( Shellwords::from(":sh echo 𒀀 hello world𒀀") .args() .collect::>(), &["echo", "𒀀", "hello", "world𒀀"] ); } #[test] fn should_preserve_quote_if_last_argument() { let sh = Shellwords::from(r#":read "file with space.txt"""#); let mut args = sh.args(); assert_eq!("file with space.txt", args.next().unwrap()); 
assert_eq!(r#"""#, args.next().unwrap()); } #[test] fn should_return_rest_of_non_closed_quote_as_one_argument() { let sh = Shellwords::from(r":rename 'should be one \'argument"); assert_eq!(r"should be one \'argument", sh.args().next().unwrap()); } #[test] fn should_respect_escaped_quote_in_what_looks_like_non_closed_arg() { let sh = Shellwords::from(r":rename 'should be one \\'argument"); let mut args = sh.args(); assert_eq!(r"should be one \\", args.next().unwrap()); assert_eq!(r"argument", args.next().unwrap()); } #[test] fn should_split_args() { assert_eq!(Shellwords::from(":o a").args().collect::>(), &["a"]); assert_eq!( Shellwords::from(":o a\\ ").args().collect::>(), &["a\\"] ); } #[test] fn should_parse_args_even_with_leading_whitespace() { // Three spaces assert_eq!( Shellwords::from(":o a").args().collect::>(), &["a"] ); } #[test] fn should_peek_next_arg_and_not_consume() { let mut args = Shellwords::from(":o a").args(); assert_eq!(Some("a"), args.peek()); assert_eq!(Some("a"), args.next()); assert_eq!(None, args.next()); } #[test] fn should_parse_single_quotes_while_respecting_escapes() { let quoted = r#":o 'single_word' 'twó wörds' '' ' ''\three\' \"with\ escaping\\' 'quote incomplete"#; let shellwords = Shellwords::from(quoted); let result = shellwords.args().collect::>(); let expected = vec![ "single_word", "twó wörds", "", " ", r#"\three\' \"with\ escaping\\"#, "quote incomplete", ]; assert_eq!(expected, result); } #[test] fn should_parse_double_quotes_while_respecting_escapes() { let dquoted = r#":o "single_word" "twó wörds" "" " ""\three\' \"with\ escaping\\" "dquote incomplete"#; let shellwords = Shellwords::from(dquoted); let result = shellwords.args().collect::>(); let expected = vec![ "single_word", "twó wörds", "", " ", r#"\three\' \"with\ escaping\\"#, "dquote incomplete", ]; assert_eq!(expected, result); } #[test] fn should_respect_escapes_with_mixed_quotes() { let dquoted = r#":o single_word 'twó wörds' "\three\' \"with\ escaping\\""no 
space before"'and after' $#%^@ "%^&(%^" ')(*&^%''a\\\\\b' '"#; let shellwords = Shellwords::from(dquoted); let result = shellwords.args().collect::>(); let expected = vec![ "single_word", "twó wörds", r#"\three\' \"with\ escaping\\"#, "no space before", "and after", "$#%^@", "%^&(%^", r")(*&^%", r"a\\\\\b", // Last ' is important, as if the user input an accidental quote at the end, this should be checked in // commands where there should only be one input and return an error rather than silently succeed. "'", ]; assert_eq!(expected, result); } #[test] fn should_return_rest() { let input = r#":set statusline.center ["file-type","file-encoding"]"#; let shellwords = Shellwords::from(input); let mut args = shellwords.args(); assert_eq!(":set", shellwords.command()); assert_eq!(Some("statusline.center"), args.next()); assert_eq!(r#"["file-type","file-encoding"]"#, args.rest()); } #[test] fn should_return_no_args() { let mut args = Args::parse(""); assert!(args.next().is_none()); assert!(args.is_empty()); assert!(args.arg_count() == 0); } #[test] fn should_leave_escaped_quotes() { let input = r#"\" \` \' \"with \'with \`with"#; let result = Args::parse(input).collect::>(); assert_eq!(r#"\""#, result[0]); assert_eq!(r"\`", result[1]); assert_eq!(r"\'", result[2]); assert_eq!(r#"\"with"#, result[3]); assert_eq!(r"\'with", result[4]); assert_eq!(r"\`with", result[5]); } #[test] fn should_leave_literal_newline_alone() { let result = Args::parse(r"\n").collect::>(); assert_eq!(r"\n", result[0]); } #[test] fn should_leave_literal_unicode_alone() { let result = Args::parse(r"\u{C}").collect::>(); assert_eq!(r"\u{C}", result[0]); } #[test] #[cfg(unix)] fn should_escape_unix() { assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar")); assert_eq!(escape("foo bar".into()), Cow::Borrowed("foo\\ bar")); assert_eq!(escape("foo\tbar".into()), Cow::Borrowed("foo\\\tbar")); } #[test] #[cfg(windows)] fn should_escape_windows() { assert_eq!(escape("foobar".into()), 
Cow::Borrowed("foobar")); assert_eq!(escape("foo bar".into()), Cow::Borrowed("\"foo bar\"")); } #[test] fn should_unescape_newline() { let unescaped = unescape("hello\\nworld"); assert_eq!("hello\nworld", unescaped); } #[test] fn should_unescape_tab() { let unescaped = unescape("hello\\tworld"); assert_eq!("hello\tworld", unescaped); } #[test] fn should_unescape_unicode() { let unescaped = unescape("hello\\u{1f929}world"); assert_eq!("hello\u{1f929}world", unescaped, "char: 🤩 "); assert_eq!("hello🤩world", unescaped); } #[test] fn should_return_original_input_due_to_bad_unicode() { let unescaped = unescape("hello\\u{999999999}world"); assert_eq!("hello\\u{999999999}world", unescaped); } #[test] fn should_not_unescape_slash() { let unescaped = unescape(r"hello\\world"); assert_eq!(r"hello\\world", unescaped); let unescaped = unescape(r"hello\\\\world"); assert_eq!(r"hello\\\\world", unescaped); } #[test] fn should_not_unescape_slash_single_quote() { let unescaped = unescape("\\'"); assert_eq!(r"\'", unescaped); } #[test] fn should_not_unescape_slash_double_quote() { let unescaped = unescape("\\\""); assert_eq!(r#"\""#, unescaped); } #[test] fn should_not_change_anything() { let unescaped = unescape("'"); assert_eq!("'", unescaped); let unescaped = unescape(r#"""#); assert_eq!(r#"""#, unescaped); } #[test] fn should_only_unescape_newline_not_slash_single_quote() { let unescaped = unescape("\\n\'"); assert_eq!("\n'", unescaped); let unescaped = unescape("\\n\\'"); assert_eq!("\n\\'", unescaped); } #[test] fn should_unescape_args() { // 1f929: 🤩 let args = Args::parse(r#"'hello\u{1f929} world' '["hello", "\u{1f929}", "world"]'"#) .collect::>(); assert_eq!("hello\u{1f929} world", unescape(args[0])); assert_eq!(r#"["hello", "🤩", "world"]"#, unescape(args[1])); } }