stdx: Add `RopeSliceExt::(nth_){next,prev}_grapheme_boundary`

These functions mirror those in `helix_core::graphemes` but operate directly on byte indices rather than character indices. These are meant to be used as we transition to Ropey v2 and always use byte indices.
2025-06-10 16:00:42 -04:00 · 2025-06-10 16:00:42 -04:00 · f5dc8245ea
parent 362e97e927
commit f5dc8245ea
1 changed files with 157 additions and 0 deletions
--- a/helix-stdx/src/rope.rs
+++ b/helix-stdx/src/rope.rs
@ -151,6 +151,88 @@ pub trait RopeSliceExt<'a>: Sized {
    /// assert_eq!(graphemes.as_slice(), &["🖼️", "🏴‍☠️", "😶‍🌫️"]);
    /// ```
    fn graphemes_rev(self) -> RevRopeGraphemes<'a>;
+    /// Finds the byte index of the next grapheme boundary after `byte_idx`.
+    ///
+    /// If `byte_idx` lies within the last grapheme cluster of the slice, the result is
+    /// `RopeSlice::len_bytes`. The default body calls `nth_next_grapheme_boundary` with `n = 1`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use ropey::Rope;
+    /// # use helix_stdx::rope::RopeSliceExt;
+    /// let text = Rope::from_str("😶‍🌫️🏴‍☠️🖼️");
+    /// let slice = text.slice(..);
+    /// let mut byte_idx = 0;
+    /// assert_eq!(slice.graphemes_at(byte_idx).next(), Some("😶‍🌫️".into()));
+    /// byte_idx = slice.next_grapheme_boundary(byte_idx);
+    /// assert_eq!(slice.graphemes_at(byte_idx).next(), Some("🏴‍☠️".into()));
+    ///
+    /// // If `byte_idx` does not lie on a character or grapheme boundary then this function is
+    /// // functionally the same as `ceil_grapheme_boundary`.
+    /// assert_eq!(slice.next_grapheme_boundary(byte_idx - 1), byte_idx);
+    /// assert_eq!(slice.next_grapheme_boundary(byte_idx - 2), byte_idx);
+    /// assert_eq!(slice.next_grapheme_boundary(byte_idx + 1), slice.next_grapheme_boundary(byte_idx));
+    /// assert_eq!(slice.next_grapheme_boundary(byte_idx + 2), slice.next_grapheme_boundary(byte_idx));
+    ///
+    /// byte_idx = slice.next_grapheme_boundary(byte_idx);
+    /// assert_eq!(slice.graphemes_at(byte_idx).next(), Some("🖼️".into()));
+    /// byte_idx = slice.next_grapheme_boundary(byte_idx);
+    /// assert_eq!(slice.graphemes_at(byte_idx).next(), None);
+    /// assert_eq!(byte_idx, slice.len_bytes());
+    /// ```
+    fn next_grapheme_boundary(self, byte_idx: usize) -> usize {
+        self.nth_next_grapheme_boundary(byte_idx, 1)
+    }
+    /// Finds the byte index of the `n`th grapheme boundary after the given `byte_idx`.
+    ///
+    /// If fewer than `n` grapheme boundaries lie after `byte_idx` in the slice, this function
+    /// returns `RopeSlice::len_bytes`. The `RopeSlice` impl panics if `byte_idx > len_bytes()`.
+    ///
+    /// This is functionally equivalent to calling `next_grapheme_boundary` `n` times, but the
+    /// `RopeSlice` implementation sets up its grapheme cursor only once.
+    fn nth_next_grapheme_boundary(self, byte_idx: usize, n: usize) -> usize;
+    /// Finds the byte index of the previous grapheme boundary before `byte_idx`.
+    ///
+    /// If `byte_idx` lies within the first grapheme cluster of the slice, the result is
+    /// zero. The default body calls `nth_prev_grapheme_boundary` with `n = 1`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use ropey::Rope;
+    /// # use helix_stdx::rope::RopeSliceExt;
+    /// let text = Rope::from_str("😶‍🌫️🏴‍☠️🖼️");
+    /// let slice = text.slice(..);
+    /// let mut byte_idx = text.len_bytes();
+    /// assert_eq!(slice.graphemes_at(byte_idx).prev(), Some("🖼️".into()));
+    /// byte_idx = slice.prev_grapheme_boundary(byte_idx);
+    /// assert_eq!(slice.graphemes_at(byte_idx).prev(), Some("🏴‍☠️".into()));
+    ///
+    /// // If `byte_idx` does not lie on a character or grapheme boundary then this function is
+    /// // functionally the same as `floor_grapheme_boundary`.
+    /// assert_eq!(slice.prev_grapheme_boundary(byte_idx + 1), byte_idx);
+    /// assert_eq!(slice.prev_grapheme_boundary(byte_idx + 2), byte_idx);
+    /// assert_eq!(slice.prev_grapheme_boundary(byte_idx - 1), slice.prev_grapheme_boundary(byte_idx));
+    /// assert_eq!(slice.prev_grapheme_boundary(byte_idx - 2), slice.prev_grapheme_boundary(byte_idx));
+    ///
+    /// byte_idx = slice.prev_grapheme_boundary(byte_idx);
+    /// assert_eq!(slice.graphemes_at(byte_idx).prev(), Some("😶‍🌫️".into()));
+    /// byte_idx = slice.prev_grapheme_boundary(byte_idx);
+    /// assert_eq!(slice.graphemes_at(byte_idx).prev(), None);
+    /// assert_eq!(byte_idx, 0);
+    /// ```
+    fn prev_grapheme_boundary(self, byte_idx: usize) -> usize {
+        self.nth_prev_grapheme_boundary(byte_idx, 1)
+    }
+    /// Finds the byte index of the `n`th grapheme boundary before the given `byte_idx`.
+    ///
+    /// If fewer than `n` grapheme boundaries lie before `byte_idx` in the slice, this function
+    /// returns zero. The `RopeSlice` implementation panics if `byte_idx > len_bytes()`.
+    ///
+    /// This is functionally equivalent to calling `prev_grapheme_boundary` `n` times, but the
+    /// `RopeSlice` implementation sets up its grapheme cursor only once.
+    fn nth_prev_grapheme_boundary(self, byte_idx: usize, n: usize) -> usize;
 }

 impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
@ -360,6 +442,81 @@ impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
            cursor: GraphemeCursor::new(self.len_bytes(), self.len_bytes(), true),
        }
    }
+
+    fn nth_next_grapheme_boundary(self, byte_idx: usize, n: usize) -> usize {
+        // `byte_idx` may equal the slice length but must not exceed it.
+        assert!(byte_idx <= self.len_bytes());
+
+        // Round down to a char boundary before handing the offset to the grapheme cursor.
+        let mut boundary = self.floor_char_boundary(byte_idx);
+        let (mut chunk, mut chunk_start, ..) = self.chunk_at_byte(boundary);
+        let mut cursor = GraphemeCursor::new(boundary, self.len_bytes(), true);
+
+        // Count down one step per boundary found; chunk/context requests do not consume a step.
+        let mut remaining = n;
+        while remaining > 0 {
+            match cursor.next_boundary(chunk, chunk_start) {
+                Ok(Some(found)) => {
+                    boundary = found;
+                    remaining -= 1;
+                }
+                // The cursor has no further boundary to report: clamp to the end of the slice.
+                Ok(None) => return self.len_bytes(),
+                Err(GraphemeIncomplete::NextChunk) => {
+                    // Advance to the chunk that starts right after the current one.
+                    chunk_start += chunk.len();
+                    chunk = self.chunk_at_byte(chunk_start).0;
+                }
+                Err(GraphemeIncomplete::PreContext(ctx_end)) => {
+                    // Supply the chunk holding the byte just before `ctx_end`; its start offset
+                    // is computed on the assumption that the chunk ends exactly at `ctx_end`.
+                    let ctx_chunk = self.chunk_at_byte(ctx_end - 1).0;
+                    cursor.provide_context(ctx_chunk, ctx_end - ctx_chunk.len());
+                }
+                // Any other cursor error is treated as impossible here.
+                _ => unreachable!(),
+            }
+        }
+
+        boundary
+    }
+
+    fn nth_prev_grapheme_boundary(self, byte_idx: usize, n: usize) -> usize {
+        // `byte_idx` may equal the slice length but must not exceed it.
+        assert!(byte_idx <= self.len_bytes());
+
+        // Round up to a char boundary before handing the offset to the grapheme cursor.
+        let mut boundary = self.ceil_char_boundary(byte_idx);
+        let (mut chunk, mut chunk_start, ..) = self.chunk_at_byte(boundary);
+        let mut cursor = GraphemeCursor::new(boundary, self.len_bytes(), true);
+
+        // Count down one step per boundary found; chunk/context requests do not consume a step.
+        let mut remaining = n;
+        while remaining > 0 {
+            match cursor.prev_boundary(chunk, chunk_start) {
+                Ok(Some(found)) => {
+                    boundary = found;
+                    remaining -= 1;
+                }
+                // The cursor has no earlier boundary to report: clamp to the start of the slice.
+                Ok(None) => return 0,
+                Err(GraphemeIncomplete::PrevChunk) => {
+                    // Step back to the chunk holding the byte just before the current chunk.
+                    let (prev_chunk, prev_start, ..) = self.chunk_at_byte(chunk_start - 1);
+                    chunk = prev_chunk;
+                    chunk_start = prev_start;
+                }
+                Err(GraphemeIncomplete::PreContext(ctx_end)) => {
+                    // Supply the chunk holding the byte just before `ctx_end` as context.
+                    let ctx_chunk = self.chunk_at_byte(ctx_end - 1).0;
+                    cursor.provide_context(ctx_chunk, ctx_end - ctx_chunk.len());
+                }
+                _ => unreachable!(),
+            }
+        }
+
+        boundary
+    }
 }

 // copied from std