helix/helix-stdx/src/rope.rs

use std::ops::{Bound, RangeBounds};

pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
pub use regex_cursor::regex_automata::util::syntax::Config;
use regex_cursor::{Input as RegexInput, RopeyCursor};
use ropey::RopeSlice;

/// Extension methods for [`RopeSlice`]: prefix/suffix tests, regex input construction,
/// whitespace scanning and rounding of byte indices to character boundaries.
///
/// Every method takes `self` by value.
pub trait RopeSliceExt<'a>: Sized {
    /// Returns `true` if the final bytes of the slice are equal to `text`.
    ///
    /// A `text` longer than the slice never matches.
    fn ends_with(self, text: &str) -> bool;
    /// Returns `true` if the leading bytes of the slice are equal to `text`.
    ///
    /// A `text` longer than the slice never matches.
    fn starts_with(self, text: &str) -> bool;
    /// Creates a regex input that covers the whole slice.
    fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;
    /// Creates a regex input whose search range is `byte_range`, given in byte offsets into the
    /// slice.
    fn regex_input_at_bytes<R: RangeBounds<usize>>(
        self,
        byte_range: R,
    ) -> RegexInput<RopeyCursor<'a>>;
    /// Creates a regex input whose search range is `char_range`, given in character indices
    /// into the slice.
    ///
    /// The bounds are converted to byte offsets and handed to
    /// [`regex_input_at_bytes`](RopeSliceExt::regex_input_at_bytes).
    fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
    /// Returns the character index of the first character that is not whitespace (as decided by
    /// [`char::is_whitespace`]), or `None` if the slice has no such character.
    fn first_non_whitespace_char(self) -> Option<usize>;
    /// Returns the character index of the last character that is not whitespace (as decided by
    /// [`char::is_whitespace`]), or `None` if the slice has no such character.
    fn last_non_whitespace_char(self) -> Option<usize>;
    /// Finds the closest byte index not exceeding `byte_idx` which lies on a character boundary.
    ///
    /// If `byte_idx` already lies on a character boundary then it is returned as-is. When
    /// `byte_idx` lies between two character boundaries, this function returns the byte index of
    /// the lesser / earlier / left-hand-side boundary.
    ///
    /// # Example
    ///
    /// ```
    /// # use ropey::RopeSlice;
    /// # use helix_stdx::rope::RopeSliceExt;
    /// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a
    /// assert_eq!(text.floor_char_boundary(0), 0);
    /// assert_eq!(text.floor_char_boundary(1), 0);
    /// assert_eq!(text.floor_char_boundary(2), 0);
    /// assert_eq!(text.floor_char_boundary(3), 3);
    /// ```
    fn floor_char_boundary(self, byte_idx: usize) -> usize;
    /// Finds the closest byte index not below `byte_idx` which lies on a character boundary.
    ///
    /// If `byte_idx` already lies on a character boundary then it is returned as-is. When
    /// `byte_idx` lies between two character boundaries, this function returns the byte index of
    /// the greater / later / right-hand-side boundary.
    ///
    /// # Example
    ///
    /// ```
    /// # use ropey::RopeSlice;
    /// # use helix_stdx::rope::RopeSliceExt;
    /// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a
    /// assert_eq!(text.ceil_char_boundary(0), 0);
    /// assert_eq!(text.ceil_char_boundary(1), 3);
    /// assert_eq!(text.ceil_char_boundary(2), 3);
    /// assert_eq!(text.ceil_char_boundary(3), 3);
    /// ```
    fn ceil_char_boundary(self, byte_idx: usize) -> usize;
}

impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
    /// Returns `true` if the last `text.len()` bytes of the slice equal `text`.
    fn ends_with(self, text: &str) -> bool {
        let len = self.len_bytes();
        if len < text.len() {
            return false;
        }
        // A `None` from `get_byte_slice` (the range could not be sliced) counts as "no match".
        self.get_byte_slice(len - text.len()..)
            .is_some_and(|end| end == text)
    }

    /// Returns `true` if the first `text.len()` bytes of the slice equal `text`.
    fn starts_with(self, text: &str) -> bool {
        let len = self.len_bytes();
        if len < text.len() {
            return false;
        }
        // A `None` from `get_byte_slice` (the range could not be sliced) counts as "no match".
        self.get_byte_slice(..text.len())
            .is_some_and(|start| start == text)
    }

    /// Creates a regex input that covers the whole slice.
    fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {
        RegexInput::new(self)
    }

    /// Creates a regex input limited to `char_range`, expressed in character indices.
    ///
    /// The bounds are translated to byte offsets and forwarded to `regex_input_at_bytes`.
    ///
    /// # Panics
    ///
    /// Panics if a bound cannot be converted by `char_to_byte` (index past the end of the
    /// slice).
    fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {
        // A char may occupy several bytes. An excluded start or an included end therefore cannot
        // keep its bound kind after conversion: the byte-level range would add a single byte and
        // start or stop in the middle of that char. Both cases are instead expressed through the
        // byte offset of the boundary that follows the char.
        let start_bound = match char_range.start_bound() {
            Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
            Bound::Excluded(&val) => Bound::Included(self.char_to_byte(val + 1)),
            Bound::Unbounded => Bound::Unbounded,
        };
        let end_bound = match char_range.end_bound() {
            Bound::Included(&val) => Bound::Excluded(self.char_to_byte(val + 1)),
            Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
            Bound::Unbounded => Bound::Unbounded,
        };
        self.regex_input_at_bytes((start_bound, end_bound))
    }

    /// Creates a regex input limited to `byte_range`, expressed in byte offsets.
    fn regex_input_at_bytes<R: RangeBounds<usize>>(
        self,
        byte_range: R,
    ) -> RegexInput<RopeyCursor<'a>> {
        // With an explicit start, build the cursor at that offset instead of at the beginning of
        // the slice; the search range itself is applied by `range` below.
        let input = match byte_range.start_bound() {
            Bound::Included(&pos) | Bound::Excluded(&pos) => {
                RegexInput::new(RopeyCursor::at(self, pos))
            }
            Bound::Unbounded => RegexInput::new(self),
        };
        input.range(byte_range)
    }

    /// Returns the char index of the first non-whitespace char, or `None` if there is none.
    fn first_non_whitespace_char(self) -> Option<usize> {
        self.chars().position(|ch| !ch.is_whitespace())
    }

    /// Returns the char index of the last non-whitespace char, or `None` if there is none.
    fn last_non_whitespace_char(self) -> Option<usize> {
        let len = self.len_chars();
        // Walk backwards from the end; `pos` counts chars from the back, so it is converted
        // into an index from the front.
        self.chars_at(len)
            .reversed()
            .position(|ch| !ch.is_whitespace())
            .map(|pos| len - pos - 1)
    }

    // These two are adapted from std's `round_char_boundary` functions:

    /// Rounds `byte_idx` down to the nearest character boundary.
    ///
    /// Indices at or past the end of the slice yield `len_bytes()`.
    fn floor_char_boundary(self, byte_idx: usize) -> usize {
        if byte_idx >= self.len_bytes() {
            self.len_bytes()
        } else {
            // Reversing an iterator created at `byte_idx + 1` yields the byte at `byte_idx`
            // first and then walks towards the start of the slice.
            let offset = self
                .bytes_at(byte_idx + 1)
                .reversed()
                .take(4)
                .position(is_utf8_char_boundary)
                .expect("a char is at most four bytes long, so a boundary must be found");

            byte_idx - offset
        }
    }

    /// Rounds `byte_idx` up to the nearest character boundary.
    ///
    /// Indices past the end of the slice yield `len_bytes()`.
    fn ceil_char_boundary(self, byte_idx: usize) -> usize {
        if byte_idx > self.len_bytes() {
            self.len_bytes()
        } else {
            // Fallback for when no boundary byte follows `byte_idx`, i.e. the index is at the
            // end of the slice or inside its last char.
            let upper_bound = self.len_bytes().min(byte_idx + 4);
            self.bytes_at(byte_idx)
                .position(is_utf8_char_boundary)
                .map_or(upper_bound, |pos| pos + byte_idx)
        }
    }
}

// Same predicate as the helper of this name in std.
/// Returns `true` if `b` is not a UTF-8 continuation byte, i.e. it can begin a character.
#[inline]
const fn is_utf8_char_boundary(b: u8) -> bool {
    // Continuation bytes have the bit pattern `10xxxxxx` (0x80..=0xBF). This is equivalent to
    // std's `(b as i8) >= -0x40`, i.e. `b < 128 || b >= 192`.
    (b & 0b1100_0000) != 0b1000_0000
}

#[cfg(test)]
mod tests {
    use ropey::RopeSlice;

    use crate::rope::RopeSliceExt;

    #[test]
    fn starts_with() {
        let slice = RopeSlice::from("asdf");
        assert!(slice.starts_with("a"));
    }

    #[test]
    fn ends_with() {
        let slice = RopeSlice::from("asdf");
        assert!(slice.ends_with("f"));
    }

    #[test]
    fn floor_ceil_char_boundary() {
        // In a pure-ASCII slice every byte index is a character boundary, so rounding in either
        // direction must return the index unchanged.
        let ascii = RopeSlice::from("ascii");
        (0..=ascii.len_bytes()).for_each(|idx| {
            assert_eq!(ascii.floor_char_boundary(idx), idx);
            assert_eq!(ascii.ceil_char_boundary(idx), idx);
        });

        // Stand-in for a former method of this trait that ceil_char_boundary superseded: it
        // maps a byte index to a _character index_, rounding up when the byte index is not on
        // a character boundary.
        fn byte_to_next_char(slice: RopeSlice, byte_idx: usize) -> usize {
            let boundary = slice.ceil_char_boundary(byte_idx);
            slice.byte_to_char(boundary)
        }

        let foobar = RopeSlice::from("foobar");
        for byte_idx in 0..=6 {
            assert_eq!(byte_to_next_char(foobar, byte_idx), byte_idx);
        }

        let emoji_len = "😆".len();
        let emojis = RopeSlice::from("😆😆😆😆😆😆😆😆😆😆");
        for char_idx in 0..10 {
            let char_start = char_idx * emoji_len;
            // The first byte of a character maps to that character...
            assert_eq!(byte_to_next_char(emojis, char_start), char_idx);
            // ...and every later byte, up to the start of the next one, rounds up to the next.
            for offset in 1..=emoji_len {
                assert_eq!(byte_to_next_char(emojis, char_start + offset), char_idx + 1);
            }
        }
    }
}
switch to regex-cursor (#9422) 2024-02-26 15:45:20 +08:00			`use std::ops::{Bound, RangeBounds};`

			`pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};`
			`pub use regex_cursor::regex_automata::util::syntax::Config;`
			`use regex_cursor::{Input as RegexInput, RopeyCursor};`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`use ropey::RopeSlice;`

switch to regex-cursor (#9422) 2024-02-26 15:45:20 +08:00			`pub trait RopeSliceExt<'a>: Sized {`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`fn ends_with(self, text: &str) -> bool;`
			`fn starts_with(self, text: &str) -> bool;`
switch to regex-cursor (#9422) 2024-02-26 15:45:20 +08:00			`fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;`
			`fn regex_input_at_bytes<R: RangeBounds<usize>>(`
			`self,`
			`byte_range: R,`
			`) -> RegexInput<RopeyCursor<'a>>;`
			`fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;`
toggling of block comments (#4718) 2024-02-27 21:36:25 +08:00			`fn first_non_whitespace_char(self) -> Option<usize>;`
			`fn last_non_whitespace_char(self) -> Option<usize>;`
stdx: Add floor/ceil char boundary functions to RopeSliceExt These functions mimic `str::floor_char_boundary` and `str::floor_char_boundary` (currently unstable under `round_char_boundary`). They're useful for correcting a byte index which may not lie on a character boundary. For example you might limit a search within a slice to some fixed number of bytes. The fixed number might not lie on a boundary though so it needs to be corrected to either the earlier (floor) or later (ceil) boundary. 2025-01-26 02:36:20 +08:00			/// Finds the closest byte index not exceeding `byte_idx` which lies on a character boundary.
			`///`
			/// If `byte_idx` already lies on a character boundary then it is returned as-is. When
			/// `byte_idx` lies between two character boundaries, this function returns the byte index of
			`/// the lesser / earlier / left-hand-side boundary.`
			`///`
			`/// # Example`
			`///`
			/// ```
			`/// # use ropey::RopeSlice;`
			`/// # use helix_stdx::rope::RopeSliceExt;`
			`/// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a`
			`/// assert_eq!(text.floor_char_boundary(0), 0);`
			`/// assert_eq!(text.floor_char_boundary(1), 0);`
			`/// assert_eq!(text.floor_char_boundary(2), 0);`
			`/// assert_eq!(text.floor_char_boundary(3), 3);`
			/// ```
			`fn floor_char_boundary(self, byte_idx: usize) -> usize;`
			/// Finds the closest byte index not below `byte_idx` which lies on a character boundary.
			`///`
			/// If `byte_idx` already lies on a character boundary then it is returned as-is. When
			/// `byte_idx` lies between two character boundaries, this function returns the byte index of
			`/// the greater / later / right-hand-side boundary.`
			`///`
			`/// # Example`
			`///`
			/// ```
			`/// # use ropey::RopeSlice;`
			`/// # use helix_stdx::rope::RopeSliceExt;`
			`/// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a`
			`/// assert_eq!(text.ceil_char_boundary(0), 0);`
			`/// assert_eq!(text.ceil_char_boundary(1), 3);`
			`/// assert_eq!(text.ceil_char_boundary(2), 3);`
			`/// assert_eq!(text.ceil_char_boundary(3), 3);`
			/// ```
			`fn ceil_char_boundary(self, byte_idx: usize) -> usize;`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`}`

switch to regex-cursor (#9422) 2024-02-26 15:45:20 +08:00			`impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`fn ends_with(self, text: &str) -> bool {`
			`let len = self.len_bytes();`
			`if len < text.len() {`
			`return false;`
			`}`
			`self.get_byte_slice(len - text.len()..)`
fix(lints): clippy 1.84 2025-01-10 01:02:21 +08:00			`.is_some_and(\|end\| end == text)`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`}`

			`fn starts_with(self, text: &str) -> bool {`
			`let len = self.len_bytes();`
			`if len < text.len() {`
			`return false;`
			`}`
Fix Rope.starts_with. (#11739) Co-authored-by: Rose Hogenson <rosehogenson@posteo.net> 2024-09-21 22:05:17 +08:00			`self.get_byte_slice(..text.len())`
fix(lints): clippy 1.84 2025-01-10 01:02:21 +08:00			`.is_some_and(\|start\| start == text)`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`}`
switch to regex-cursor (#9422) 2024-02-26 15:45:20 +08:00
			`fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {`
			`RegexInput::new(self)`
			`}`

			`fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {`
			`let start_bound = match char_range.start_bound() {`
			`Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),`
			`Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),`
			`Bound::Unbounded => Bound::Unbounded,`
			`};`
			`let end_bound = match char_range.end_bound() {`
			`Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),`
			`Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),`
			`Bound::Unbounded => Bound::Unbounded,`
			`};`
			`self.regex_input_at_bytes((start_bound, end_bound))`
			`}`
			`fn regex_input_at_bytes<R: RangeBounds<usize>>(`
			`self,`
			`byte_range: R,`
			`) -> RegexInput<RopeyCursor<'a>> {`
			`let input = match byte_range.start_bound() {`
			`Bound::Included(&pos) \| Bound::Excluded(&pos) => {`
			`RegexInput::new(RopeyCursor::at(self, pos))`
			`}`
			`Bound::Unbounded => RegexInput::new(self),`
			`};`
			`input.range(byte_range)`
			`}`
toggling of block comments (#4718) 2024-02-27 21:36:25 +08:00			`fn first_non_whitespace_char(self) -> Option<usize> {`
			`self.chars().position(\|ch\| !ch.is_whitespace())`
			`}`
			`fn last_non_whitespace_char(self) -> Option<usize> {`
			`self.chars_at(self.len_chars())`
			`.reversed()`
			`.position(\|ch\| !ch.is_whitespace())`
			`.map(\|pos\| self.len_chars() - pos - 1)`
			`}`
don't manually grapheme align ts highlights (#10310) 2024-04-10 23:14:08 +08:00
stdx: Add floor/ceil char boundary functions to RopeSliceExt These functions mimic `str::floor_char_boundary` and `str::floor_char_boundary` (currently unstable under `round_char_boundary`). They're useful for correcting a byte index which may not lie on a character boundary. For example you might limit a search within a slice to some fixed number of bytes. The fixed number might not lie on a boundary though so it needs to be corrected to either the earlier (floor) or later (ceil) boundary. 2025-01-26 02:36:20 +08:00			// These two are adapted from std's `round_char_boundary` functions:

			`fn floor_char_boundary(self, byte_idx: usize) -> usize {`
			`if byte_idx >= self.len_bytes() {`
			`self.len_bytes()`
			`} else {`
			`let offset = self`
			`.bytes_at(byte_idx + 1)`
			`.reversed()`
			`.take(4)`
			`.position(is_utf8_char_boundary)`
			`// A char can only be four bytes long so we are guaranteed to find a boundary.`
			`.unwrap();`

			`byte_idx - offset`
			`}`
			`}`

			`fn ceil_char_boundary(self, byte_idx: usize) -> usize {`
			`if byte_idx > self.len_bytes() {`
			`self.len_bytes()`
			`} else {`
			`let upper_bound = self.len_bytes().min(byte_idx + 4);`
			`self.bytes_at(byte_idx)`
			`.position(is_utf8_char_boundary)`
			`.map_or(upper_bound, \|pos\| pos + byte_idx)`
			`}`
			`}`
don't manually grapheme align ts highlights (#10310) 2024-04-10 23:14:08 +08:00			`}`

			`// copied from std`
			`#[inline]`
			`const fn is_utf8_char_boundary(b: u8) -> bool {`
			`// This is bit magic equivalent to: b < 128 \|\| b >= 192`
			`(b as i8) >= -0x40`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use ropey::RopeSlice;`

			`use crate::rope::RopeSliceExt;`

Fix Rope.starts_with. (#11739) Co-authored-by: Rose Hogenson <rosehogenson@posteo.net> 2024-09-21 22:05:17 +08:00			`#[test]`
			`fn starts_with() {`
			`assert!(RopeSlice::from("asdf").starts_with("a"));`
			`}`

			`#[test]`
			`fn ends_with() {`
			`assert!(RopeSlice::from("asdf").ends_with("f"));`
			`}`
stdx: Add floor/ceil char boundary functions to RopeSliceExt These functions mimic `str::floor_char_boundary` and `str::floor_char_boundary` (currently unstable under `round_char_boundary`). They're useful for correcting a byte index which may not lie on a character boundary. For example you might limit a search within a slice to some fixed number of bytes. The fixed number might not lie on a boundary though so it needs to be corrected to either the earlier (floor) or later (ceil) boundary. 2025-01-26 02:36:20 +08:00
			`#[test]`
			`fn floor_ceil_char_boundary() {`
			`let ascii = RopeSlice::from("ascii");`
			`// When the given index lies on a character boundary, the index should not change.`
			`for byte_idx in 0..=ascii.len_bytes() {`
			`assert_eq!(ascii.floor_char_boundary(byte_idx), byte_idx);`
			`assert_eq!(ascii.ceil_char_boundary(byte_idx), byte_idx);`
			`}`
stdx: Replace RopeSliceExt::byte_to_next_char with ceil_char_boundary The new `RopeSliceExt::ceil_char_boundary` from the parent commits can be used to implement `RopeSliceExt::byte_to_next_char` when used with `RopeSlice::byte_to_char`. That function had only one caller and that caller will eventually disappear when we switch to Ropey v2 and drop character indexing, so we can drop `byte_to_next_char` now and replace its caller with `byte_to_char` plus `ceil_char_boundary`. This change keeps the unit tests for `byte_to_next_char` and checks them against a polyfill of `byte_to_char` plus `ceil_char_boundary` to ensure that `byte_to_next_char`'s intended behavior is not changed. 2025-01-27 00:02:14 +08:00
			`// This is a polyfill of a method of this trait which was replaced by ceil_char_boundary.`
			`// It returns the _character index_ of the given byte index, rounding up if it does not`
			`// already lie on a character boundary.`
			`fn byte_to_next_char(slice: RopeSlice, byte_idx: usize) -> usize {`
			`slice.byte_to_char(slice.ceil_char_boundary(byte_idx))`
			`}`

			`for i in 0..=6 {`
			`assert_eq!(byte_to_next_char(RopeSlice::from("foobar"), i), i);`
			`}`
			`for char_idx in 0..10 {`
			`let len = "😆".len();`
			`assert_eq!(`
			`byte_to_next_char(RopeSlice::from("😆😆😆😆😆😆😆😆😆😆"), char_idx * len),`
			`char_idx`
			`);`
			`for i in 1..=len {`
			`assert_eq!(`
			`byte_to_next_char(RopeSlice::from("😆😆😆😆😆😆😆😆😆😆"), char_idx * len + i),`
			`char_idx + 1`
			`);`
			`}`
			`}`
stdx: Add floor/ceil char boundary functions to RopeSliceExt These functions mimic `str::floor_char_boundary` and `str::floor_char_boundary` (currently unstable under `round_char_boundary`). They're useful for correcting a byte index which may not lie on a character boundary. For example you might limit a search within a slice to some fixed number of bytes. The fixed number might not lie on a boundary though so it needs to be corrected to either the earlier (floor) or later (ceil) boundary. 2025-01-26 02:36:20 +08:00			`}`
refactor completion and signature help using hooks 2023-12-01 07:03:27 +08:00			`}`