Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 36 additions & 20 deletions crates/hstr/src/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ use triomphe::ThinArc;

use crate::{
tagged_value::{TaggedValue, MAX_INLINE_LEN},
Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK,
wtf8::Wtf8,
Atom, Wtf8Atom, INLINE_TAG, INLINE_TAG_INIT, LEN_OFFSET, TAG_MASK,
};

#[derive(PartialEq, Eq)]
Expand Down Expand Up @@ -73,6 +74,11 @@ impl AtomStore {
atom_in(self, &text.into())
}

/// Creates a [`Wtf8Atom`] for `text`, using this store as the backing
/// storage passed to `wtf8_atom_in`.
///
/// `text` may be anything convertible into a `Cow<'_, Wtf8>`; only its raw
/// bytes are forwarded.
#[inline(always)]
pub fn wtf8_atom<'a>(&mut self, text: impl Into<Cow<'a, Wtf8>>) -> Wtf8Atom {
    let text: Cow<'a, Wtf8> = text.into();
    wtf8_atom_in(self, text.as_bytes())
}

fn gc(&mut self) {
self.data.retain(|item, _| {
let count = ThinArc::strong_count(&item.0);
Expand All @@ -94,6 +100,14 @@ pub fn global_atom_store_gc() {
});
}

/// Creates a [`Wtf8Atom`] from raw bytes, using the store held in
/// `GLOBAL_DATA` as the backing storage.
///
/// The store is mutably borrowed only for the duration of the call.
pub(crate) fn global_wtf8_atom(text: &[u8]) -> Wtf8Atom {
    GLOBAL_DATA.with(|cell| wtf8_atom_in(&mut *cell.borrow_mut(), text))
}

pub(crate) fn global_atom(text: &str) -> Atom {
GLOBAL_DATA.with(|global| {
let mut store = global.borrow_mut();
Expand All @@ -102,9 +116,7 @@ pub(crate) fn global_atom(text: &str) -> Atom {
})
}

/// This can create any kind of [Atom], although this lives in the `dynamic`
/// module.
fn atom_in<S>(storage: S, text: &str) -> Atom
fn wtf8_atom_in<S>(storage: S, text: &[u8]) -> Wtf8Atom
where
S: Storage,
{
Expand All @@ -115,9 +127,9 @@ where
let tag = INLINE_TAG_INIT | ((len as u8) << LEN_OFFSET);
let mut unsafe_data = TaggedValue::new_tag(tag);
unsafe {
unsafe_data.data_mut()[..len].copy_from_slice(text.as_bytes());
unsafe_data.data_mut()[..len].copy_from_slice(text);
}
return Atom { unsafe_data };
return Wtf8Atom { unsafe_data };
}

let hash = calc_hash(text);
Expand All @@ -129,12 +141,22 @@ where
NonNull::new_unchecked(entry)
};
debug_assert!(0 == ptr.as_ptr() as u8 & TAG_MASK);
Atom {
Wtf8Atom {
unsafe_data: TaggedValue::new_ptr(ptr),
}
}

/// Attempts to construct an Atom but only if it can be constructed inline.
/// Builds an [Atom] from UTF-8 `text` by delegating to `wtf8_atom_in` and
/// reinterpreting the result.
///
/// Although this lives in the `dynamic` module, it can create any kind of
/// [Atom].
fn atom_in<S: Storage>(storage: S, text: &str) -> Atom {
    let wtf8 = wtf8_atom_in(storage, text.as_bytes());
    // SAFETY: `wtf8` was built from the bytes of a `&str`, so it is valid UTF-8
    unsafe { Atom::from_wtf8_unchecked(wtf8) }
}

/// Attempts to construct an [Atom] but only if it can be constructed inline.
/// This is primarily useful in constant contexts.
pub(crate) const fn inline_atom(text: &str) -> Option<Atom> {
let len = text.len();
Expand All @@ -159,31 +181,25 @@ pub(crate) const fn inline_atom(text: &str) -> Option<Atom> {
}

trait Storage {
fn insert_entry(self, text: &str, hash: u64) -> Item;
fn insert_entry(self, text: &[u8], hash: u64) -> Item;
}

impl Storage for &'_ mut AtomStore {
fn insert_entry(self, text: &str, hash: u64) -> Item {
fn insert_entry(self, text: &[u8], hash: u64) -> Item {
// If the text is too long, interning is not worth it.
if text.len() > 512 {
return Item(ThinArc::from_header_and_slice(
Metadata { hash },
text.as_bytes(),
));
return Item(ThinArc::from_header_and_slice(Metadata { hash }, text));
}

let (entry, _) = self
.data
.raw_entry_mut()
.from_hash(hash, |key| {
key.header.header.hash == hash && key.slice.eq(text.as_bytes())
key.header.header.hash == hash && key.slice.eq(text)
})
.or_insert_with(move || {
(
Item(ThinArc::from_header_and_slice(
Metadata { hash },
text.as_bytes(),
)),
Item(ThinArc::from_header_and_slice(Metadata { hash }, text)),
(),
)
});
Expand All @@ -192,7 +208,7 @@ impl Storage for &'_ mut AtomStore {
}

#[inline(always)]
fn calc_hash(text: &str) -> u64 {
fn calc_hash(text: &[u8]) -> u64 {
let mut hasher = FxHasher::default();
text.hash(&mut hasher);
hasher.finish()
Expand Down
51 changes: 49 additions & 2 deletions crates/hstr/src/global_store.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
use std::borrow::Cow;
use std::{
borrow::Cow,
mem::{forget, ManuallyDrop},
};

use crate::{dynamic::global_atom, Atom};
use crate::{
dynamic::{global_atom, global_wtf8_atom},
wtf8::{Wtf8, Wtf8Buf},
Atom, Wtf8Atom,
};

macro_rules! direct_from_impl {
($T:ty) => {
Expand All @@ -21,3 +28,43 @@ impl From<Box<str>> for crate::Atom {
global_atom(&s)
}
}

// Implements `From<$T> for Wtf8Atom` for a type that exposes `as_bytes()`,
// routing those bytes through `global_wtf8_atom`.
macro_rules! direct_from_impl_wtf8 {
    ($T:ty) => {
        impl From<$T> for Wtf8Atom {
            fn from(value: $T) -> Wtf8Atom {
                let bytes: &[u8] = value.as_bytes();
                global_wtf8_atom(bytes)
            }
        }
    };
}

// `From` conversions into `Wtf8Atom` for borrowed and owned string-like
// types; each one forwards `as_bytes()` to `global_wtf8_atom`.
direct_from_impl_wtf8!(&'_ str);
direct_from_impl_wtf8!(Cow<'_, str>);
direct_from_impl_wtf8!(String);
direct_from_impl_wtf8!(&'_ Wtf8);
direct_from_impl_wtf8!(Wtf8Buf);

impl From<&Atom> for crate::Wtf8Atom {
    /// Converts a borrowed [Atom] into a [Wtf8Atom] that reuses the same
    /// tagged value.
    fn from(s: &Atom) -> Self {
        // The clone is deliberately never dropped: the returned `Wtf8Atom`
        // takes over its `unsafe_data`.
        let cloned = ManuallyDrop::new(s.clone());
        Wtf8Atom {
            unsafe_data: cloned.unsafe_data,
        }
    }
}

impl From<Atom> for crate::Wtf8Atom {
    /// Converts an owned [Atom] into a [Wtf8Atom] by moving its tagged value
    /// across.
    fn from(s: Atom) -> Self {
        let unsafe_data = s.unsafe_data;
        // Skip `Atom`'s destructor: the tagged value now belongs to the
        // `Wtf8Atom` returned below.
        forget(s);
        Wtf8Atom { unsafe_data }
    }
}

impl From<Box<str>> for crate::Wtf8Atom {
    /// Creates a [Wtf8Atom] from the bytes of a boxed string via
    /// `global_wtf8_atom`.
    fn from(s: Box<str>) -> Self {
        let bytes: &[u8] = s.as_bytes();
        global_wtf8_atom(bytes)
    }
}
89 changes: 35 additions & 54 deletions crates/hstr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use core::str;
use std::{
fmt::{Debug, Display},
hash::Hash,
mem::{self, forget, transmute},
mem::{self, forget, transmute, ManuallyDrop},
num::NonZeroU8,
ops::Deref,
str::from_utf8_unchecked,
Expand All @@ -15,13 +15,21 @@ use debug_unreachable::debug_unreachable;
use once_cell::sync::Lazy;

pub use crate::dynamic::{global_atom_store_gc, AtomStore};
use crate::tagged_value::TaggedValue;
use crate::{
macros::{get_hash, impl_from_alias, partial_eq},
tagged_value::TaggedValue,
};

mod dynamic;
mod global_store;
mod macros;
mod tagged_value;
#[cfg(test)]
mod tests;
pub mod wtf8;
mod wtf8_atom;

pub use wtf8_atom::Wtf8Atom;

/// An immutable string which is cheap to clone, compare, hash, and has small
/// size.
Expand Down Expand Up @@ -253,20 +261,7 @@ impl Atom {

impl Atom {
fn get_hash(&self) -> u64 {
match self.tag() {
DYNAMIC_TAG => {
unsafe { crate::dynamic::deref_from(self.unsafe_data) }
.header
.header
.hash
}
INLINE_TAG => {
// This is passed as input to the caller's `Hasher` implementation, so it's okay
// that this isn't really a hash
self.unsafe_data.hash()
}
_ => unsafe { debug_unreachable!() },
}
get_hash!(self)
}

fn as_str(&self) -> &str {
Expand Down Expand Up @@ -302,30 +297,7 @@ impl Atom {
impl PartialEq for Atom {
#[inline(never)]
fn eq(&self, other: &Self) -> bool {
if self.unsafe_data == other.unsafe_data {
return true;
}

// If one is inline and the other is not, the length is different.
// If one is static and the other is not, it's different.
if self.tag() != other.tag() {
return false;
}

if self.is_dynamic() && other.is_dynamic() {
let te = unsafe { crate::dynamic::deref_from(self.unsafe_data) };
let oe = unsafe { crate::dynamic::deref_from(other.unsafe_data) };

if te.header.header.hash != oe.header.header.hash {
return false;
}

return te.slice == oe.slice;
}

if self.get_hash() != other.get_hash() {
return false;
}
partial_eq!(self, other);

// If the store is different, the string may be the same, even though the
// `unsafe_data` is different
Expand Down Expand Up @@ -358,20 +330,7 @@ impl Clone for Atom {
}
}

impl Atom {
#[inline]
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = crate::dynamic::restore_arc(alias);
forget(arc.clone());
forget(arc);
}
}

Self { unsafe_data: alias }
}
}
impl_from_alias!(Atom);

impl Deref for Atom {
type Target = str;
Expand Down Expand Up @@ -443,6 +402,28 @@ where
}
}

impl Atom {
/// Converts a WTF-8 encoded [Wtf8Atom] to a regular UTF-8 [Atom] without
/// validation.
///
/// # Safety
///
/// The caller must ensure that the WTF-8 atom contains only valid UTF-8
/// data (no unpaired surrogates). This function performs no validation
/// and will create an invalid `Atom` if the input contains unpaired
/// surrogates.
///
/// This is a zero-cost conversion that preserves all internal optimizations
/// (inline storage, precomputed hashes, etc.) since both types have
/// identical internal representation.
pub unsafe fn from_wtf8_unchecked(s: Wtf8Atom) -> Self {
let s = ManuallyDrop::new(s);
Atom {
unsafe_data: s.unsafe_data,
}
}
}

#[cfg(test)]
mod macro_tests {

Expand Down
71 changes: 71 additions & 0 deletions crates/hstr/src/macros.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Expands to an expression that yields the hash value of an atom-like
// `$self`, chosen by its tag:
// - `DYNAMIC_TAG`: the hash stored in the header of the entry that
//   `unsafe_data` points to.
// - `INLINE_TAG`: the result of `unsafe_data.hash()`.
// Any other tag hits `debug_unreachable!()`.
//
// `$self` is evaluated more than once, so pass a side-effect-free expression.
// `DYNAMIC_TAG`, `INLINE_TAG` and `debug_unreachable!` are not imported by the
// macro itself and must be resolvable where it is expanded.
macro_rules! get_hash {
($self:expr) => {
match $self.tag() {
DYNAMIC_TAG => {
// Copy the tagged value out before passing it to `deref_from`.
let unsafe_data = $self.unsafe_data;
unsafe { $crate::dynamic::deref_from(unsafe_data) }
.header
.header
.hash
}
INLINE_TAG => {
// This is passed as input to the caller's `Hasher` implementation, so it's okay
// that this isn't really a hash
$self.unsafe_data.hash()
}
_ => unsafe { debug_unreachable!() },
}
};
}

// Expands to the shared fast-path prefix of an `eq` implementation for
// atom-like types.
//
// The expansion is a sequence of statements that `return true` / `return
// false` from the ENCLOSING function, so it must be used inside a function
// returning `bool`. If no branch returns, execution falls through to whatever
// the caller writes after the macro invocation.
//
// `$self` and `$other` must provide `unsafe_data`, `tag()`, `is_dynamic()` and
// `get_hash()`.
macro_rules! partial_eq {
($self:expr, $other:expr) => {
// Identical tagged values are equal.
if $self.unsafe_data == $other.unsafe_data {
return true;
}

// If one is inline and the other is not, the length is different.
// If one is static and the other is not, it's different.
if $self.tag() != $other.tag() {
return false;
}

// Both dynamic: compare the stored hashes first, then the byte slices.
if $self.is_dynamic() && $other.is_dynamic() {
let te = unsafe { $crate::dynamic::deref_from($self.unsafe_data) };
let oe = unsafe { $crate::dynamic::deref_from($other.unsafe_data) };

if te.header.header.hash != oe.header.header.hash {
return false;
}

return te.slice == oe.slice;
}

// Remaining cases: differing hashes mean the values differ; equal hashes
// are left for the caller's code to decide.
if $self.get_hash() != $other.get_hash() {
return false;
}
};
}

// Generates an inherent `pub(crate) fn from_alias(alias: TaggedValue) -> Self`
// for `$ty`, which must be constructible as `Self { unsafe_data: alias }`.
//
// `TaggedValue`, `TAG_MASK`, `DYNAMIC_TAG` and `forget` are not imported by the
// macro and must be in scope where it is expanded.
macro_rules! impl_from_alias {
($ty:ty) => {
impl $ty {
#[inline]
pub(crate) fn from_alias(alias: TaggedValue) -> Self {
// Only dynamic values get the arc handling below; other tags are
// wrapped directly.
if alias.tag() & TAG_MASK == DYNAMIC_TAG {
unsafe {
let arc = $crate::dynamic::restore_arc(alias);
// Neither the clone nor the restored arc is ever dropped.
// NOTE(review): presumably this leaves the strong count one
// higher, to account for the new value; confirm against
// `restore_arc`.
forget(arc.clone());
forget(arc);
}
}

Self { unsafe_data: alias }
}
}
};
}

pub(crate) use get_hash;
pub(crate) use impl_from_alias;
pub(crate) use partial_eq;
Loading
Loading