2024-02-26 15:45:20 +08:00
|
|
|
use std::ops::{Bound, RangeBounds};
|
|
|
|
|
|
|
|
pub use regex_cursor::engines::meta::{Builder as RegexBuilder, Regex};
|
|
|
|
pub use regex_cursor::regex_automata::util::syntax::Config;
|
|
|
|
use regex_cursor::{Input as RegexInput, RopeyCursor};
|
2023-12-01 07:03:27 +08:00
|
|
|
use ropey::RopeSlice;
|
|
|
|
|
2024-02-26 15:45:20 +08:00
|
|
|
pub trait RopeSliceExt<'a>: Sized {
|
2023-12-01 07:03:27 +08:00
|
|
|
fn ends_with(self, text: &str) -> bool;
|
|
|
|
fn starts_with(self, text: &str) -> bool;
|
2024-02-26 15:45:20 +08:00
|
|
|
fn regex_input(self) -> RegexInput<RopeyCursor<'a>>;
|
|
|
|
fn regex_input_at_bytes<R: RangeBounds<usize>>(
|
|
|
|
self,
|
|
|
|
byte_range: R,
|
|
|
|
) -> RegexInput<RopeyCursor<'a>>;
|
|
|
|
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>>;
|
2024-02-27 21:36:25 +08:00
|
|
|
fn first_non_whitespace_char(self) -> Option<usize>;
|
|
|
|
fn last_non_whitespace_char(self) -> Option<usize>;
|
2025-01-26 02:36:20 +08:00
|
|
|
/// Finds the closest byte index not exceeding `byte_idx` which lies on a character boundary.
|
|
|
|
///
|
|
|
|
/// If `byte_idx` already lies on a character boundary then it is returned as-is. When
|
|
|
|
/// `byte_idx` lies between two character boundaries, this function returns the byte index of
|
|
|
|
/// the lesser / earlier / left-hand-side boundary.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// # use ropey::RopeSlice;
|
|
|
|
/// # use helix_stdx::rope::RopeSliceExt;
|
|
|
|
/// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a
|
|
|
|
/// assert_eq!(text.floor_char_boundary(0), 0);
|
|
|
|
/// assert_eq!(text.floor_char_boundary(1), 0);
|
|
|
|
/// assert_eq!(text.floor_char_boundary(2), 0);
|
|
|
|
/// assert_eq!(text.floor_char_boundary(3), 3);
|
|
|
|
/// ```
|
|
|
|
fn floor_char_boundary(self, byte_idx: usize) -> usize;
|
|
|
|
/// Finds the closest byte index not below `byte_idx` which lies on a character boundary.
|
|
|
|
///
|
|
|
|
/// If `byte_idx` already lies on a character boundary then it is returned as-is. When
|
|
|
|
/// `byte_idx` lies between two character boundaries, this function returns the byte index of
|
|
|
|
/// the greater / later / right-hand-side boundary.
|
|
|
|
///
|
|
|
|
/// # Example
|
|
|
|
///
|
|
|
|
/// ```
|
|
|
|
/// # use ropey::RopeSlice;
|
|
|
|
/// # use helix_stdx::rope::RopeSliceExt;
|
|
|
|
/// let text = RopeSlice::from("⌚"); // three bytes: e2 8c 9a
|
|
|
|
/// assert_eq!(text.ceil_char_boundary(0), 0);
|
|
|
|
/// assert_eq!(text.ceil_char_boundary(1), 3);
|
|
|
|
/// assert_eq!(text.ceil_char_boundary(2), 3);
|
|
|
|
/// assert_eq!(text.ceil_char_boundary(3), 3);
|
|
|
|
/// ```
|
|
|
|
fn ceil_char_boundary(self, byte_idx: usize) -> usize;
|
2023-12-01 07:03:27 +08:00
|
|
|
}
|
|
|
|
|
2024-02-26 15:45:20 +08:00
|
|
|
impl<'a> RopeSliceExt<'a> for RopeSlice<'a> {
|
2023-12-01 07:03:27 +08:00
|
|
|
fn ends_with(self, text: &str) -> bool {
|
|
|
|
let len = self.len_bytes();
|
|
|
|
if len < text.len() {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
self.get_byte_slice(len - text.len()..)
|
2025-01-10 01:02:21 +08:00
|
|
|
.is_some_and(|end| end == text)
|
2023-12-01 07:03:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
fn starts_with(self, text: &str) -> bool {
|
|
|
|
let len = self.len_bytes();
|
|
|
|
if len < text.len() {
|
|
|
|
return false;
|
|
|
|
}
|
2024-09-21 22:05:17 +08:00
|
|
|
self.get_byte_slice(..text.len())
|
2025-01-10 01:02:21 +08:00
|
|
|
.is_some_and(|start| start == text)
|
2023-12-01 07:03:27 +08:00
|
|
|
}
|
2024-02-26 15:45:20 +08:00
|
|
|
|
|
|
|
fn regex_input(self) -> RegexInput<RopeyCursor<'a>> {
|
|
|
|
RegexInput::new(self)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn regex_input_at<R: RangeBounds<usize>>(self, char_range: R) -> RegexInput<RopeyCursor<'a>> {
|
|
|
|
let start_bound = match char_range.start_bound() {
|
|
|
|
Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
|
|
|
|
Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
|
|
|
|
Bound::Unbounded => Bound::Unbounded,
|
|
|
|
};
|
|
|
|
let end_bound = match char_range.end_bound() {
|
|
|
|
Bound::Included(&val) => Bound::Included(self.char_to_byte(val)),
|
|
|
|
Bound::Excluded(&val) => Bound::Excluded(self.char_to_byte(val)),
|
|
|
|
Bound::Unbounded => Bound::Unbounded,
|
|
|
|
};
|
|
|
|
self.regex_input_at_bytes((start_bound, end_bound))
|
|
|
|
}
|
|
|
|
fn regex_input_at_bytes<R: RangeBounds<usize>>(
|
|
|
|
self,
|
|
|
|
byte_range: R,
|
|
|
|
) -> RegexInput<RopeyCursor<'a>> {
|
|
|
|
let input = match byte_range.start_bound() {
|
|
|
|
Bound::Included(&pos) | Bound::Excluded(&pos) => {
|
|
|
|
RegexInput::new(RopeyCursor::at(self, pos))
|
|
|
|
}
|
|
|
|
Bound::Unbounded => RegexInput::new(self),
|
|
|
|
};
|
|
|
|
input.range(byte_range)
|
|
|
|
}
|
2024-02-27 21:36:25 +08:00
|
|
|
fn first_non_whitespace_char(self) -> Option<usize> {
|
|
|
|
self.chars().position(|ch| !ch.is_whitespace())
|
|
|
|
}
|
|
|
|
fn last_non_whitespace_char(self) -> Option<usize> {
|
|
|
|
self.chars_at(self.len_chars())
|
|
|
|
.reversed()
|
|
|
|
.position(|ch| !ch.is_whitespace())
|
|
|
|
.map(|pos| self.len_chars() - pos - 1)
|
|
|
|
}
|
2024-04-10 23:14:08 +08:00
|
|
|
|
2025-01-26 02:36:20 +08:00
|
|
|
// These two are adapted from std's `round_char_boundary` functions:
|
|
|
|
|
|
|
|
fn floor_char_boundary(self, byte_idx: usize) -> usize {
|
|
|
|
if byte_idx >= self.len_bytes() {
|
|
|
|
self.len_bytes()
|
|
|
|
} else {
|
|
|
|
let offset = self
|
|
|
|
.bytes_at(byte_idx + 1)
|
|
|
|
.reversed()
|
|
|
|
.take(4)
|
|
|
|
.position(is_utf8_char_boundary)
|
|
|
|
// A char can only be four bytes long so we are guaranteed to find a boundary.
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
byte_idx - offset
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn ceil_char_boundary(self, byte_idx: usize) -> usize {
|
|
|
|
if byte_idx > self.len_bytes() {
|
|
|
|
self.len_bytes()
|
|
|
|
} else {
|
|
|
|
let upper_bound = self.len_bytes().min(byte_idx + 4);
|
|
|
|
self.bytes_at(byte_idx)
|
|
|
|
.position(is_utf8_char_boundary)
|
|
|
|
.map_or(upper_bound, |pos| pos + byte_idx)
|
|
|
|
}
|
|
|
|
}
|
2024-04-10 23:14:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// adapted from std
/// Reports whether `b` can be the first byte of a UTF-8 encoded character.
///
/// Continuation bytes have the bit pattern `10xx_xxxx`; every other byte value is treated as
/// the start of a character.
#[inline]
const fn is_utf8_char_boundary(b: u8) -> bool {
    // Equivalent to `b < 128 || b >= 192`: only the top two bits decide the outcome.
    (b & 0xC0) != 0x80
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use ropey::RopeSlice;

    use crate::rope::RopeSliceExt;

    #[test]
    fn starts_with() {
        let slice = RopeSlice::from("asdf");
        assert!(slice.starts_with("a"));
    }

    #[test]
    fn ends_with() {
        let slice = RopeSlice::from("asdf");
        assert!(slice.ends_with("f"));
    }

    #[test]
    fn floor_ceil_char_boundary() {
        // Polyfill of a former method of this trait that `ceil_char_boundary` replaced.
        // It maps a byte index to a _character index_, rounding up whenever the byte index
        // does not already sit on a character boundary.
        fn byte_to_next_char(slice: RopeSlice, byte_idx: usize) -> usize {
            let boundary = slice.ceil_char_boundary(byte_idx);
            slice.byte_to_char(boundary)
        }

        // Every index of an ASCII text is a character boundary, so rounding is a no-op.
        let ascii = RopeSlice::from("ascii");
        for byte_idx in 0..=ascii.len_bytes() {
            assert_eq!(ascii.floor_char_boundary(byte_idx), byte_idx);
            assert_eq!(ascii.ceil_char_boundary(byte_idx), byte_idx);
        }

        let foobar = RopeSlice::from("foobar");
        for i in 0..=6 {
            assert_eq!(byte_to_next_char(foobar, i), i);
        }

        let emojis = RopeSlice::from("😆😆😆😆😆😆😆😆😆😆");
        let len = "😆".len();
        for char_idx in 0..10 {
            let char_start = char_idx * len;
            // Exactly on the boundary: the index maps to the char itself.
            assert_eq!(byte_to_next_char(emojis, char_start), char_idx);
            // Anywhere past the boundary (up to the next one): rounds up to the next char.
            for i in 1..=len {
                assert_eq!(byte_to_next_char(emojis, char_start + i), char_idx + 1);
            }
        }
    }
}
|