diff --git a/helix-stdx/src/lib.rs b/helix-stdx/src/lib.rs
index d09df587a..93cc8304a 100644
--- a/helix-stdx/src/lib.rs
+++ b/helix-stdx/src/lib.rs
@@ -3,5 +3,6 @@ pub mod faccess;
 pub mod path;
 pub mod range;
 pub mod rope;
+pub mod str;
 
 pub use range::Range;
diff --git a/helix-stdx/src/str.rs b/helix-stdx/src/str.rs
new file mode 100644
index 000000000..967102a8a
--- /dev/null
+++ b/helix-stdx/src/str.rs
@@ -0,0 +1,323 @@
+//! Utilities for working with strings and specialized string types.
+
+use std::{
+    alloc,
+    borrow::{Borrow, Cow},
+    fmt, hash,
+    mem::{size_of, ManuallyDrop},
+    ptr::{self, NonNull},
+    slice, str,
+};
+
+/// A very very small owned string type.
+///
+/// This type is like a `Box<str>` and is similarly two `usize`s large. It can only fit strings
+/// with a byte length smaller than 256. On 64-bit machines this type stores up to 15 bytes inline
+/// (7 bytes on 32-bit machines). One byte is used to store the length. For strings short enough
+/// to be stored inline, the remaining 15 (or 7) bytes store the content inline. Otherwise the
+/// second `usize` of memory is a thin pointer to the string content.
+///
+/// Unlike `Box<str>` this type is not null-pointer optimized.
+#[repr(C)]
+pub struct TinyBoxedStr {
+    /// Length of the string in bytes. Values above `INLINE_LEN` mean the content is allocated.
+    len: u8,
+    /// Leading bytes of the string. Allocated strings keep a copy of their first bytes here.
+    prefix: [u8; Self::PREFIX_LEN],
+    /// The remaining bytes of an inline string or the pointer to an allocated string.
+    trailing: TinyBoxedStrTrailing,
+}
+
+/// The second `usize` of a [`TinyBoxedStr`]: inline bytes or a pointer, depending on the length.
+#[repr(C)]
+union TinyBoxedStrTrailing {
+    suffix: [u8; TinyBoxedStr::SUFFIX_LEN],
+    ptr: ManuallyDrop<NonNull<u8>>,
+}
+
+impl TinyBoxedStr {
+    // 1 usize minus the byte to store the length.
+    const PREFIX_LEN: usize = size_of::<usize>() - size_of::<u8>();
+    // The other `usize` is a pointer or the end parts of an inline string.
+    const SUFFIX_LEN: usize = size_of::<usize>();
+    // ... for a grand total of 15 bytes for 64-bit machines or 7 for 32-bit.
+    const INLINE_LEN: u8 = (Self::PREFIX_LEN + Self::SUFFIX_LEN) as u8;
+
+    /// The maximum length in bytes of a string that fits in a `TinyBoxedStr`.
+    pub const MAX_LEN: usize = u8::MAX as usize;
+
+    /// Returns the length of the string in bytes.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len as usize
+    }
+
+    /// Returns `true` if the string is zero bytes long.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Returns the content of the string as a byte slice.
+    pub fn as_bytes(&self) -> &[u8] {
+        let ptr = if self.len <= Self::INLINE_LEN {
+            // The pointer is derived from the whole struct because the slice built below may
+            // extend past `prefix` into `trailing.suffix`.
+            let ptr = ptr::from_ref(self);
+            // SAFETY: `ptr` was just created from a valid reference.
+            unsafe { ptr::addr_of!((*ptr).prefix) }.cast()
+        } else {
+            // SAFETY: strings longer than `INLINE_LEN` always store a pointer in `trailing`.
+            unsafe { self.trailing.ptr }.as_ptr()
+        };
+        // SAFETY: `ptr` points to at least `self.len` initialized bytes: either the `prefix` and
+        // `suffix` byte arrays, which are adjacent in this `#[repr(C)]` struct, or the allocation.
+        unsafe { slice::from_raw_parts(ptr, self.len()) }
+    }
+
+    /// Returns the content of the string as a `&str`.
+    #[inline]
+    pub fn as_str(&self) -> &str {
+        // SAFETY: the bytes are either copied from a `str` or zeroed, and callers of
+        // `as_bytes_mut` must keep them valid UTF-8.
+        unsafe { str::from_utf8_unchecked(self.as_bytes()) }
+    }
+
+    /// Exposes the bytes as a mutable slice.
+    ///
+    /// When a string is short enough to be inline, this slice points to the `prefix` and `suffix`
+    /// parts of the struct. Otherwise the slice wraps the pointer to the allocation.
+    ///
+    /// # Safety
+    ///
+    /// If the string is allocated then it is the caller's responsibility to ensure that any
+    /// modifications made to `s.as_bytes_mut()[..Self::PREFIX_LEN]` are written to `s.prefix` as
+    /// well.
+    ///
+    /// It is also the caller's responsibility to ensure that edits to the bytes do not make the
+    /// bytes invalid UTF-8.
+    unsafe fn as_bytes_mut(&mut self) -> &mut [u8] {
+        let ptr = if self.len <= Self::INLINE_LEN {
+            let ptr = ptr::from_mut(self);
+            // SAFETY: `ptr` was just created from a valid reference.
+            unsafe { ptr::addr_of_mut!((*ptr).prefix) }.cast()
+        } else {
+            // SAFETY: strings longer than `INLINE_LEN` always store a pointer in `trailing`.
+            unsafe { self.trailing.ptr }.as_ptr()
+        };
+        // SAFETY: see `as_bytes`. `&mut self` makes the access exclusive.
+        unsafe { slice::from_raw_parts_mut(ptr, self.len()) }
+    }
+
+    /// The layout of the allocation that holds a string of `len` bytes.
+    ///
+    /// `Drop` frees allocations with this layout, including those taken over from a `Box<str>`.
+    fn layout(len: u8) -> alloc::Layout {
+        alloc::Layout::array::<u8>(len as usize)
+            .expect("a valid layout for an array")
+            .pad_to_align()
+    }
+
+    /// Creates a new `TinyBoxedStr` of the given length with all bytes zeroed.
+    ///
+    /// While this is used to create uninitialized strings which are later filled, note that the
+    /// zero byte is valid UTF-8 so the zeroed representation is always valid.
+    fn zeroed(len: u8) -> Self {
+        let trailing = if len <= Self::INLINE_LEN {
+            TinyBoxedStrTrailing {
+                suffix: [0; Self::SUFFIX_LEN],
+            }
+        } else {
+            let layout = Self::layout(len);
+            // SAFETY: `len` is greater than `INLINE_LEN` so the layout has a non-zero size.
+            let nullable = unsafe { alloc::alloc_zeroed(layout) };
+            let Some(ptr) = NonNull::new(nullable) else {
+                alloc::handle_alloc_error(layout);
+            };
+            TinyBoxedStrTrailing {
+                ptr: ManuallyDrop::new(ptr),
+            }
+        };
+        Self {
+            len,
+            prefix: [0; Self::PREFIX_LEN],
+            trailing,
+        }
+    }
+}
+
+/// The error returned when a string is longer than [`TinyBoxedStr::MAX_LEN`] bytes.
+#[derive(Debug)]
+pub struct TooLongError;
+
+impl fmt::Display for TooLongError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("string was too long to be stored as a `TinyBoxedStr` (max 255 bytes)")
+    }
+}
+
+impl std::error::Error for TooLongError {}
+
+impl TryFrom<&str> for TinyBoxedStr {
+    type Error = TooLongError;
+
+    fn try_from(s: &str) -> Result<Self, Self::Error> {
+        if s.len() > Self::MAX_LEN {
+            return Err(TooLongError);
+        }
+
+        let mut this = Self::zeroed(s.len() as u8);
+        // SAFETY: if `s` is valid UTF-8, `this`'s bytes will be valid UTF-8.
+        unsafe { this.as_bytes_mut() }.copy_from_slice(s.as_bytes());
+        // Allocated strings also keep a copy of their first bytes in `prefix`.
+        if this.len > Self::INLINE_LEN {
+            this.prefix
+                .copy_from_slice(&s.as_bytes()[..Self::PREFIX_LEN]);
+        }
+        Ok(this)
+    }
+}
+
+// NOTE: converting from a `String` to a `TinyBoxedStr` is cheap when the string's length is equal
+// to its capacity.
+impl TryFrom<String> for TinyBoxedStr {
+    type Error = TooLongError;
+
+    fn try_from(s: String) -> Result<Self, Self::Error> {
+        // Inline strings must be cloned. It's a constant number of bytes to copy though.
+        if s.len() <= Self::INLINE_LEN as usize {
+            return s.as_str().try_into();
+        }
+
+        if s.len() > Self::MAX_LEN {
+            return Err(TooLongError);
+        }
+
+        // Otherwise we can sometimes steal the `String`'s allocation if the string is allocated
+        // exactly (i.e. `s.len() == s.capacity()`). A `Box<str>` is defined as being allocated
+        // exactly so we first convert to `Box<str>` (which will reallocate if the capacity is not
+        // the same as the length) and then steal its pointer.
+        let len = s.len() as u8;
+        let mut prefix = [0; Self::PREFIX_LEN];
+        prefix.copy_from_slice(&s.as_bytes()[..Self::PREFIX_LEN]);
+        let ptr = Box::into_raw(s.into_boxed_str()).cast::<u8>();
+        // SAFETY: `Box::into_raw` docs guarantee non-null.
+        let ptr = ManuallyDrop::new(unsafe { NonNull::new_unchecked(ptr) });
+        let trailing = TinyBoxedStrTrailing { ptr };
+
+        Ok(Self {
+            len,
+            prefix,
+            trailing,
+        })
+    }
+}
+
+impl TryFrom<Cow<'_, str>> for TinyBoxedStr {
+    type Error = TooLongError;
+
+    fn try_from(s: Cow<'_, str>) -> Result<Self, Self::Error> {
+        match s {
+            Cow::Borrowed(s) => s.try_into(),
+            Cow::Owned(s) => s.try_into(),
+        }
+    }
+}
+
+impl TryFrom<ropey::RopeSlice<'_>> for TinyBoxedStr {
+    type Error = TooLongError;
+
+    fn try_from(slice: ropey::RopeSlice<'_>) -> Result<Self, Self::Error> {
+        // `impl From<RopeSlice> for String` uses `String::with_capacity` so we can reuse its
+        // allocation whenever it allocates `slice.len_bytes()`.
+        let s: Cow<str> = slice.into();
+        s.try_into()
+    }
+}
+
+impl Drop for TinyBoxedStr {
+    fn drop(&mut self) {
+        if self.len > Self::INLINE_LEN {
+            // SAFETY: strings longer than `INLINE_LEN` always store a pointer in `trailing`.
+            let ptr = unsafe { self.trailing.ptr }.as_ptr();
+            let layout = Self::layout(self.len);
+            // SAFETY: the allocation was made with this layout, either by `zeroed` or as a
+            // `Box<str>` of `self.len` bytes.
+            unsafe { alloc::dealloc(ptr, layout) }
+        }
+    }
+}
+
+impl Clone for TinyBoxedStr {
+    fn clone(&self) -> Self {
+        let mut this = Self::zeroed(self.len);
+        // SAFETY: if `self` is valid UTF-8 then `this` will be too.
+        unsafe { this.as_bytes_mut() }.copy_from_slice(self.as_bytes());
+        // Allocated strings also keep a copy of their first bytes in `prefix`.
+        if this.len > Self::INLINE_LEN {
+            this.prefix
+                .copy_from_slice(&self.as_bytes()[..Self::PREFIX_LEN]);
+        }
+        this
+    }
+}
+
+impl Default for TinyBoxedStr {
+    fn default() -> Self {
+        Self::zeroed(0)
+    }
+}
+
+impl AsRef<str> for TinyBoxedStr {
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Borrow<str> for TinyBoxedStr {
+    fn borrow(&self) -> &str {
+        self.as_str()
+    }
+}
+
+// NOTE: this could be specialized to optimize the number of comparison operations. We could cast
+// the first `usize` of memory together to do a single comparison (and same for the suffixes).
+// This optimization would only matter if we compared these strings very frequently however.
+impl PartialEq for TinyBoxedStr {
+    fn eq(&self, other: &Self) -> bool {
+        self.as_str() == other.as_str()
+    }
+}
+
+impl Eq for TinyBoxedStr {}
+
+impl PartialEq<str> for TinyBoxedStr {
+    fn eq(&self, other: &str) -> bool {
+        self.as_str() == other
+    }
+}
+
+// Hashes like `str` does, as `Borrow<str>` requires.
+impl hash::Hash for TinyBoxedStr {
+    fn hash<H: hash::Hasher>(&self, state: &mut H) {
+        self.as_str().hash(state)
+    }
+}
+
+impl fmt::Debug for TinyBoxedStr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.as_str().fmt(f)
+    }
+}
+
+impl fmt::Display for TinyBoxedStr {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.as_str().fmt(f)
+    }
+}
+
+// SAFETY: a `TinyBoxedStr` owns its bytes, inline or allocated, and is only mutated through
+// `&mut self`.
+unsafe impl Send for TinyBoxedStr {}
+// SAFETY: see `Send` above.
+unsafe impl Sync for TinyBoxedStr {}
diff --git a/helix-view/src/handlers/word_index.rs b/helix-view/src/handlers/word_index.rs
index f84e77e28..61b65f700 100644
--- a/helix-view/src/handlers/word_index.rs
+++ b/helix-view/src/handlers/word_index.rs
@@ -123,8 +123,7 @@ const MIN_WORD_GRAPHEMES: usize = 3;
 /// Maximum word length allowed (in chars)
 const MAX_WORD_LEN: usize = 50;
 
-// TODO: choose or create a suitable small string type.
-type Word = String;
+type Word = helix_stdx::str::TinyBoxedStr;
 
 #[derive(Debug, Default)]
 struct WordIndexInner {
@@ -142,11 +141,16 @@ impl WordIndexInner {
     }
 
     fn insert(&mut self, word: RopeSlice) {
+        assert!(word.len_chars() <= MAX_WORD_LEN);
+        // The word must be shorter than `TinyBoxedStr::MAX_LEN` because it is at most 50
+        // characters and characters take at most four bytes.
+        assert!(word.len_bytes() < Word::MAX_LEN);
+
         let word: Cow<str> = word.into();
         if let Some(rc) = self.words.get_mut(word.as_ref()) {
             *rc = rc.saturating_add(1);
         } else {
-            self.words.insert(word.into_owned(), 1);
+            self.words.insert(word.try_into().unwrap(), 1);
         }
     }
 
@@ -172,7 +176,10 @@ impl WordIndex {
         let inner = self.inner.read();
         let mut matches = fuzzy_match(pattern, inner.words(), false);
         matches.sort_unstable_by_key(|(_, score)| *score);
-        matches.into_iter().map(|(word, _)| word.clone()).collect()
+        matches
+            .into_iter()
+            .map(|(word, _)| word.to_string())
+            .collect()
     }
 
     fn add_document(&self, text: &Rope) {
@@ -409,7 +416,7 @@ mod tests {
     impl WordIndex {
         fn words(&self) -> HashSet<String> {
             let inner = self.inner.read();
-            inner.words().cloned().collect()
+            inner.words().map(|w| w.to_string()).collect()
         }
     }
 