From c68016081ad87e2795e425f9427a82c89aa246c8 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Tue, 10 Jun 2025 16:00:42 -0400 Subject: [PATCH 1/3] stdx: Add `RopeSliceExt::(nth_){next,prev}_grapheme_boundary` These functions mirror those in `helix_core::graphemes` but operate directly on byte indices rather than character indices. These are meant to be used as we transition to Ropey v2 and always use byte indices. --- helix-stdx/src/rope.rs | 141 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs index 9fc348f58..99ad79be2 100644 --- a/helix-stdx/src/rope.rs +++ b/helix-stdx/src/rope.rs @@ -151,6 +151,72 @@ pub trait RopeSliceExt<'a>: Sized { /// assert_eq!(graphemes.as_slice(), &["πŸ–ΌοΈ", "πŸ΄β€β˜ οΈ", "πŸ˜Άβ€πŸŒ«οΈ"]); /// ``` fn graphemes_rev(self) -> RevRopeGraphemes<'a>; + /// Finds the byte index of the next grapheme boundary after `byte_idx`. + /// + /// If the byte index lies on the last grapheme cluster in the slice then this function + /// returns `RopeSlice::len_bytes`. + /// + /// # Example + /// + /// ``` + /// # use ropey::Rope; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = Rope::from_str("πŸ˜Άβ€πŸŒ«οΈπŸ΄β€β˜ οΈπŸ–ΌοΈ"); + /// let slice = text.slice(..); + /// let mut byte_idx = 0; + /// assert_eq!(slice.graphemes_at(byte_idx).next(), Some("πŸ˜Άβ€πŸŒ«οΈ".into())); + /// byte_idx = slice.next_grapheme_boundary(byte_idx); + /// assert_eq!(slice.graphemes_at(byte_idx).next(), Some("πŸ΄β€β˜ οΈ".into())); + /// byte_idx = slice.next_grapheme_boundary(byte_idx); + /// assert_eq!(slice.graphemes_at(byte_idx).next(), Some("πŸ–ΌοΈ".into())); + /// byte_idx = slice.next_grapheme_boundary(byte_idx); + /// assert_eq!(slice.graphemes_at(byte_idx).next(), None); + /// assert_eq!(byte_idx, slice.len_bytes()); + /// ``` + fn next_grapheme_boundary(self, byte_idx: usize) -> usize { + self.nth_next_grapheme_boundary(byte_idx, 1) + } + /// Finds the byte index of the `n`th grapheme cluster after the given `byte_idx`. + /// + /// If there are more than `n` grapheme clusters after `byte_idx` in the rope then this + /// function returns `RopeSlice::len_bytes`. + /// + /// This is functionally equivalent to calling `next_grapheme_boundary` `n` times but is more + /// efficient. + fn nth_next_grapheme_boundary(self, byte_idx: usize, n: usize) -> usize; + /// Finds the byte index of the previous grapheme boundary before `byte_idx`. + /// + /// If the byte index lies on the first grapheme cluster in the slice then this function + /// returns zero. + /// + /// # Example + /// + /// ``` + /// # use ropey::Rope; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = Rope::from_str("πŸ˜Άβ€πŸŒ«οΈπŸ΄β€β˜ οΈπŸ–ΌοΈ"); + /// let slice = text.slice(..); + /// let mut byte_idx = text.len_bytes(); + /// assert_eq!(slice.graphemes_at(byte_idx).prev(), Some("πŸ–ΌοΈ".into())); + /// byte_idx = slice.prev_grapheme_boundary(byte_idx); + /// assert_eq!(slice.graphemes_at(byte_idx).prev(), Some("πŸ΄β€β˜ οΈ".into())); + /// byte_idx = slice.prev_grapheme_boundary(byte_idx); + /// assert_eq!(slice.graphemes_at(byte_idx).prev(), Some("πŸ˜Άβ€πŸŒ«οΈ".into())); + /// byte_idx = slice.prev_grapheme_boundary(byte_idx); + /// assert_eq!(slice.graphemes_at(byte_idx).prev(), None); + /// assert_eq!(byte_idx, 0); + /// ``` + fn prev_grapheme_boundary(self, byte_idx: usize) -> usize { + self.nth_prev_grapheme_boundary(byte_idx, 1) + } + /// Finds the byte index of the `n`th grapheme cluster before the given `byte_idx`. + /// + /// If there are fewer than `n` grapheme clusters before `byte_idx` in the rope then this + /// function returns zero. + /// + /// This is functionally equivalent to calling `prev_grapheme_boundary` `n` times but is more + /// efficient. + fn nth_prev_grapheme_boundary(self, byte_idx: usize, n: usize) -> usize; } impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { @@ -360,6 +426,81 @@ impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { cursor: GraphemeCursor::new(self.len_bytes(), self.len_bytes(), true), } } + + fn nth_next_grapheme_boundary(self, mut byte_idx: usize, n: usize) -> usize { + // Bounds check + debug_assert!(byte_idx <= self.len_bytes()); + + byte_idx = self.floor_char_boundary(byte_idx); + + // Get the chunk with our byte index in it. + let (mut chunk, mut chunk_byte_idx, _, _) = self.chunk_at_byte(byte_idx); + + // Set up the grapheme cursor. + let mut gc = GraphemeCursor::new(byte_idx, self.len_bytes(), true); + + // Find the nth next grapheme cluster boundary. + for _ in 0..n { + loop { + match gc.next_boundary(chunk, chunk_byte_idx) { + Ok(None) => return self.len_bytes(), + Ok(Some(n)) => { + byte_idx = n; + break; + } + Err(GraphemeIncomplete::NextChunk) => { + chunk_byte_idx += chunk.len(); + let (a, _, _, _) = self.chunk_at_byte(chunk_byte_idx); + chunk = a; + } + Err(GraphemeIncomplete::PreContext(n)) => { + let ctx_chunk = self.chunk_at_byte(n - 1).0; + gc.provide_context(ctx_chunk, n - ctx_chunk.len()); + } + _ => unreachable!(), + } + } + } + + byte_idx + } + + fn nth_prev_grapheme_boundary(self, mut byte_idx: usize, n: usize) -> usize { + // Bounds check + debug_assert!(byte_idx <= self.len_bytes()); + + byte_idx = self.ceil_char_boundary(byte_idx); + + // Get the chunk with our byte index in it. + let (mut chunk, mut chunk_byte_idx, _, _) = self.chunk_at_byte(byte_idx); + + // Set up the grapheme cursor. + let mut gc = GraphemeCursor::new(byte_idx, self.len_bytes(), true); + + for _ in 0..n { + loop { + match gc.prev_boundary(chunk, chunk_byte_idx) { + Ok(None) => return 0, + Ok(Some(n)) => { + byte_idx = n; + break; + } + Err(GraphemeIncomplete::PrevChunk) => { + let (a, b, _, _) = self.chunk_at_byte(chunk_byte_idx - 1); + chunk = a; + chunk_byte_idx = b; + } + Err(GraphemeIncomplete::PreContext(n)) => { + let ctx_chunk = self.chunk_at_byte(n - 1).0; + gc.provide_context(ctx_chunk, n - ctx_chunk.len()); + } + _ => unreachable!(), + } + } + } + + byte_idx + } } // copied from std From ab9e36069b74c6241c66ca78e775e69bf4820830 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Tue, 10 Jun 2025 20:55:47 -0400 Subject: [PATCH 2/3] stdx: Unify RopeSlice grapheme cluster iterators This style for RopeGraphemes is identical to Ropey's Chars and Bytes iterators. Being able to move the iterator types like cursors over the bytes/chars/graphemes is useful in some cases. For example see `helix_core::movement::::range_to_target`. This change also adds `RopeSliceExt::graphemes_at` for flexibility. `graphemes` and `graphemes_rev` are now implemented in terms of `graphemes_at` and `RopeGraphemes::reversed`. --- helix-stdx/src/rope.rs | 261 ++++++++++++++++++++++------------------- 1 file changed, 140 insertions(+), 121 deletions(-) diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs index 99ad79be2..d881867e6 100644 --- a/helix-stdx/src/rope.rs +++ b/helix-stdx/src/rope.rs @@ -135,7 +135,9 @@ pub trait RopeSliceExt<'a>: Sized { /// let graphemes: Vec<_> = text.graphemes().collect(); /// assert_eq!(graphemes.as_slice(), &["πŸ˜Άβ€πŸŒ«οΈ", "πŸ΄β€β˜ οΈ", "πŸ–ΌοΈ"]); /// ``` - fn graphemes(self) -> RopeGraphemes<'a>; + fn graphemes(self) -> RopeGraphemes<'a> { + self.graphemes_at(0) + } /// Returns an iterator over the grapheme clusters in the slice, reversed. /// /// The returned iterator starts at the end of the slice and ends at the beginning of the @@ -150,7 +152,23 @@ pub trait RopeSliceExt<'a>: Sized { /// let graphemes: Vec<_> = text.graphemes_rev().collect(); /// assert_eq!(graphemes.as_slice(), &["πŸ–ΌοΈ", "πŸ΄β€β˜ οΈ", "πŸ˜Άβ€πŸŒ«οΈ"]); /// ``` - fn graphemes_rev(self) -> RevRopeGraphemes<'a>; + fn graphemes_rev(self) -> RopeGraphemes<'a>; + /// Returns an iterator over the grapheme clusters in the slice at the given byte index. + /// + /// # Example + /// + /// ``` + /// # use ropey::Rope; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = Rope::from_str("πŸ˜Άβ€πŸŒ«οΈπŸ΄β€β˜ οΈπŸ–ΌοΈ"); + /// // 14 is the byte index of the pirate flag's starting cluster boundary. + /// let graphemes: Vec<_> = text.slice(..).graphemes_at(14).collect(); + /// assert_eq!(graphemes.as_slice(), &["πŸ΄β€β˜ οΈ", "πŸ–ΌοΈ"]); + /// // 27 is the byte index of the pirate flag's ending cluster boundary. + /// let graphemes: Vec<_> = text.slice(..).graphemes_at(27).reversed().collect(); + /// assert_eq!(graphemes.as_slice(), &["πŸ΄β€β˜ οΈ", "πŸ˜Άβ€πŸŒ«οΈ"]); + /// ``` + fn graphemes_at(self, byte_idx: usize) -> RopeGraphemes<'a>; /// Finds the byte index of the next grapheme boundary after `byte_idx`. /// /// If the byte index lies on the last grapheme cluster in the slice then this function @@ -401,29 +419,21 @@ impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { } } - fn graphemes(self) -> RopeGraphemes<'a> { - let mut chunks = self.chunks(); - let first_chunk = chunks.next().unwrap_or(""); + fn graphemes_rev(self) -> RopeGraphemes<'a> { + self.graphemes_at(self.len_bytes()).reversed() + } + + fn graphemes_at(self, byte_idx: usize) -> RopeGraphemes<'a> { + let (mut chunks, chunk_byte_idx, _, _) = self.chunks_at_byte(byte_idx); + let current_chunk = chunks.next().unwrap_or(""); + RopeGraphemes { text: self, chunks, - cur_chunk: first_chunk, - cur_chunk_start: 0, - cursor: GraphemeCursor::new(0, self.len_bytes(), true), - } - } - - fn graphemes_rev(self) -> RevRopeGraphemes<'a> { - let (mut chunks, mut cur_chunk_start, _, _) = self.chunks_at_byte(self.len_bytes()); - chunks.reverse(); - let first_chunk = chunks.next().unwrap_or(""); - cur_chunk_start -= first_chunk.len(); - RevRopeGraphemes { - text: self, - chunks, - cur_chunk: first_chunk, - cur_chunk_start, - cursor: GraphemeCursor::new(self.len_bytes(), self.len_bytes(), true), + current_chunk, + chunk_byte_idx, + cursor: GraphemeCursor::new(byte_idx, self.len_bytes(), true), + is_reversed: false, } } @@ -511,13 +521,18 @@ const fn is_utf8_char_boundary(b: u8) -> bool { } /// An iterator over the graphemes of a `RopeSlice`. +/// +/// This iterator is cursor-like: rather than implementing DoubleEndedIterator it can be reversed +/// like a cursor. #[derive(Clone)] pub struct RopeGraphemes<'a> { text: RopeSlice<'a>, chunks: Chunks<'a>, - cur_chunk: &'a str, - cur_chunk_start: usize, + current_chunk: &'a str, + /// Byte index of the start of the current chunk. + chunk_byte_idx: usize, cursor: GraphemeCursor, + is_reversed: bool, } impl fmt::Debug for RopeGraphemes<'_> { @@ -525,112 +540,116 @@ impl fmt::Debug for RopeGraphemes<'_> { f.debug_struct("RopeGraphemes") .field("text", &self.text) .field("chunks", &self.chunks) - .field("cur_chunk", &self.cur_chunk) - .field("cur_chunk_start", &self.cur_chunk_start) + .field("current_chunk", &self.current_chunk) + .field("chunk_byte_idx", &self.chunk_byte_idx) // .field("cursor", &self.cursor) + .field("is_reversed", &self.is_reversed) .finish() } } +impl<'a> RopeGraphemes<'a> { + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> Option> { + if self.is_reversed { + self.prev_impl() + } else { + self.next_impl() + } + } + + pub fn prev(&mut self) -> Option> { + if self.is_reversed { + self.next_impl() + } else { + self.prev_impl() + } + } + + pub fn reverse(&mut self) { + self.is_reversed = !self.is_reversed; + } + + #[must_use] + pub fn reversed(mut self) -> Self { + self.reverse(); + self + } + + fn next_impl(&mut self) -> Option> { + let a = self.cursor.cur_cursor(); + let b; + loop { + match self + .cursor + .next_boundary(self.current_chunk, self.chunk_byte_idx) + { + Ok(None) => return None, + Ok(Some(n)) => { + b = n; + break; + } + Err(GraphemeIncomplete::NextChunk) => { + self.chunk_byte_idx += self.current_chunk.len(); + self.current_chunk = self.chunks.next().unwrap_or(""); + } + Err(GraphemeIncomplete::PreContext(idx)) => { + let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); + self.cursor.provide_context(chunk, byte_idx); + } + _ => unreachable!(), + } + } + + if a < self.chunk_byte_idx { + Some(self.text.byte_slice(a..b)) + } else { + let a2 = a - self.chunk_byte_idx; + let b2 = b - self.chunk_byte_idx; + Some((&self.current_chunk[a2..b2]).into()) + } + } + + fn prev_impl(&mut self) -> Option> { + let a = self.cursor.cur_cursor(); + let b; + loop { + match self + .cursor + .prev_boundary(self.current_chunk, self.chunk_byte_idx) + { + Ok(None) => return None, + Ok(Some(n)) => { + b = n; + break; + } + Err(GraphemeIncomplete::PrevChunk) => { + self.current_chunk = self.chunks.prev().unwrap_or(""); + self.chunk_byte_idx -= self.current_chunk.len(); + } + Err(GraphemeIncomplete::PreContext(idx)) => { + let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); + self.cursor.provide_context(chunk, byte_idx); + } + _ => unreachable!(), + } + } + + if a >= self.chunk_byte_idx + self.current_chunk.len() { + Some(self.text.byte_slice(b..a)) + } else { + let a2 = a - self.chunk_byte_idx; + let b2 = b - self.chunk_byte_idx; + Some((&self.current_chunk[b2..a2]).into()) + } + } +} + impl<'a> Iterator for RopeGraphemes<'a> { type Item = RopeSlice<'a>; fn next(&mut self) -> Option { - let a = self.cursor.cur_cursor(); - let b; - loop { - match self - .cursor - .next_boundary(self.cur_chunk, self.cur_chunk_start) - { - Ok(None) => { - return None; - } - Ok(Some(n)) => { - b = n; - break; - } - Err(GraphemeIncomplete::NextChunk) => { - self.cur_chunk_start += self.cur_chunk.len(); - self.cur_chunk = self.chunks.next().unwrap_or(""); - } - Err(GraphemeIncomplete::PreContext(idx)) => { - let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); - self.cursor.provide_context(chunk, byte_idx); - } - _ => unreachable!(), - } - } - - if a < self.cur_chunk_start { - Some(self.text.byte_slice(a..b)) - } else { - let a2 = a - self.cur_chunk_start; - let b2 = b - self.cur_chunk_start; - Some((&self.cur_chunk[a2..b2]).into()) - } - } -} - -/// An iterator over the graphemes of a `RopeSlice` in reverse. -#[derive(Clone)] -pub struct RevRopeGraphemes<'a> { - text: RopeSlice<'a>, - chunks: Chunks<'a>, - cur_chunk: &'a str, - cur_chunk_start: usize, - cursor: GraphemeCursor, -} - -impl fmt::Debug for RevRopeGraphemes<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("RevRopeGraphemes") - .field("text", &self.text) - .field("chunks", &self.chunks) - .field("cur_chunk", &self.cur_chunk) - .field("cur_chunk_start", &self.cur_chunk_start) - // .field("cursor", &self.cursor) - .finish() - } -} - -impl<'a> Iterator for RevRopeGraphemes<'a> { - type Item = RopeSlice<'a>; - - fn next(&mut self) -> Option { - let a = self.cursor.cur_cursor(); - let b; - loop { - match self - .cursor - .prev_boundary(self.cur_chunk, self.cur_chunk_start) - { - Ok(None) => { - return None; - } - Ok(Some(n)) => { - b = n; - break; - } - Err(GraphemeIncomplete::PrevChunk) => { - self.cur_chunk = self.chunks.next().unwrap_or(""); - self.cur_chunk_start -= self.cur_chunk.len(); - } - Err(GraphemeIncomplete::PreContext(idx)) => { - let (chunk, byte_idx, _, _) = self.text.chunk_at_byte(idx.saturating_sub(1)); - self.cursor.provide_context(chunk, byte_idx); - } - _ => unreachable!(), - } - } - - if a >= self.cur_chunk_start + self.cur_chunk.len() { - Some(self.text.byte_slice(b..a)) - } else { - let a2 = a - self.cur_chunk_start; - let b2 = b - self.cur_chunk_start; - Some((&self.cur_chunk[b2..a2]).into()) - } + RopeGraphemes::next(self) } } From 6d8e874da1e30dfd1c81134dd307fc56d282b911 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Wed, 11 Jun 2025 10:37:11 -0400 Subject: [PATCH 3/3] stdx: Add an iterator over grapheme indices in a rope slice --- helix-stdx/src/rope.rs | 92 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/helix-stdx/src/rope.rs b/helix-stdx/src/rope.rs index d881867e6..111bb1dcc 100644 --- a/helix-stdx/src/rope.rs +++ b/helix-stdx/src/rope.rs @@ -169,6 +169,28 @@ pub trait RopeSliceExt<'a>: Sized { /// assert_eq!(graphemes.as_slice(), &["πŸ΄β€β˜ οΈ", "πŸ˜Άβ€πŸŒ«οΈ"]); /// ``` fn graphemes_at(self, byte_idx: usize) -> RopeGraphemes<'a>; + /// Returns an iterator over the grapheme clusters in a rope and the byte index where each + /// grapheme cluster starts. + /// + /// # Example + /// + /// ``` + /// # use ropey::Rope; + /// # use helix_stdx::rope::RopeSliceExt; + /// let text = Rope::from_str("πŸ˜Άβ€πŸŒ«οΈπŸ΄β€β˜ οΈπŸ–ΌοΈ"); + /// let slice = text.slice(..); + /// let graphemes: Vec<_> = slice.grapheme_indices_at(0).collect(); + /// assert_eq!( + /// graphemes.as_slice(), + /// &[(0, "πŸ˜Άβ€πŸŒ«οΈ".into()), (14, "πŸ΄β€β˜ οΈ".into()), (27, "πŸ–ΌοΈ".into())] + /// ); + /// let graphemes: Vec<_> = slice.grapheme_indices_at(slice.len_bytes()).reversed().collect(); + /// assert_eq!( + /// graphemes.as_slice(), + /// &[(27, "πŸ–ΌοΈ".into()), (14, "πŸ΄β€β˜ οΈ".into()), (0, "πŸ˜Άβ€πŸŒ«οΈ".into())] + /// ); + /// ``` + fn grapheme_indices_at(self, byte_idx: usize) -> RopeGraphemeIndices<'a>; /// Finds the byte index of the next grapheme boundary after `byte_idx`. /// /// If the byte index lies on the last grapheme cluster in the slice then this function @@ -437,6 +459,14 @@ impl<'a> RopeSliceExt<'a> for RopeSlice<'a> { } } + fn grapheme_indices_at(self, byte_idx: usize) -> RopeGraphemeIndices<'a> { + RopeGraphemeIndices { + front_offset: byte_idx, + iter: self.graphemes_at(byte_idx), + is_reversed: false, + } + } + fn nth_next_grapheme_boundary(self, mut byte_idx: usize, n: usize) -> usize { // Bounds check debug_assert!(byte_idx <= self.len_bytes()); @@ -653,6 +683,68 @@ impl<'a> Iterator for RopeGraphemes<'a> { } } +/// An iterator over the grapheme clusters in a rope and the byte indices where each grapheme +/// cluster starts. +/// +/// This iterator wraps `RopeGraphemes` and is also cursor-like. Use `reverse` or `reversed` to +/// toggle the direction of the iterator. +#[derive(Debug, Clone)] +pub struct RopeGraphemeIndices<'a> { + front_offset: usize, + iter: RopeGraphemes<'a>, + is_reversed: bool, +} + +impl<'a> RopeGraphemeIndices<'a> { + #[allow(clippy::should_implement_trait)] + pub fn next(&mut self) -> Option<(usize, RopeSlice<'a>)> { + if self.is_reversed { + self.prev_impl() + } else { + self.next_impl() + } + } + + pub fn prev(&mut self) -> Option<(usize, RopeSlice<'a>)> { + if self.is_reversed { + self.next_impl() + } else { + self.prev_impl() + } + } + + pub fn reverse(&mut self) { + self.is_reversed = !self.is_reversed; + } + + #[must_use] + pub fn reversed(mut self) -> Self { + self.reverse(); + self + } + + fn next_impl(&mut self) -> Option<(usize, RopeSlice<'a>)> { + let slice = self.iter.next()?; + let idx = self.front_offset; + self.front_offset += slice.len_bytes(); + Some((idx, slice)) + } + + fn prev_impl(&mut self) -> Option<(usize, RopeSlice<'a>)> { + let slice = self.iter.prev()?; + self.front_offset -= slice.len_bytes(); + Some((self.front_offset, slice)) + } +} + +impl<'a> Iterator for RopeGraphemeIndices<'a> { + type Item = (usize, RopeSlice<'a>); + + fn next(&mut self) -> Option { + RopeGraphemeIndices::next(self) + } +} + #[cfg(test)] mod tests { use ropey::RopeSlice;