diff options
| author | Laurenz <laurmaedje@gmail.com> | 2024-11-29 16:03:08 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-11-29 15:03:08 +0000 |
| commit | d40c8ab6ab4f7051a12e5ce9433439f3a5afb99f (patch) | |
| tree | 8e5b2972b1c5ede14a05ae5525c4a07fb3dda300 /crates/typst-utils/src | |
| parent | 055263ee9f2253de9f176970df9d47d3b2bd2467 (diff) | |
Compile-time `PicoStr` interning (#5491)
Diffstat (limited to 'crates/typst-utils/src')
| -rw-r--r-- | crates/typst-utils/src/lib.rs | 2 | ||||
| -rw-r--r-- | crates/typst-utils/src/pico.rs | 405 |
2 files changed, 369 insertions, 38 deletions
diff --git a/crates/typst-utils/src/lib.rs b/crates/typst-utils/src/lib.rs index e199e1bb..61703250 100644 --- a/crates/typst-utils/src/lib.rs +++ b/crates/typst-utils/src/lib.rs @@ -16,7 +16,7 @@ pub use self::bitset::{BitSet, SmallBitSet}; pub use self::deferred::Deferred; pub use self::duration::format_duration; pub use self::hash::LazyHash; -pub use self::pico::PicoStr; +pub use self::pico::{PicoStr, ResolvedPicoStr}; pub use self::round::{round_int_with_precision, round_with_precision}; pub use self::scalar::Scalar; diff --git a/crates/typst-utils/src/pico.rs b/crates/typst-utils/src/pico.rs index 7fcd3343..dbab14a1 100644 --- a/crates/typst-utils/src/pico.rs +++ b/crates/typst-utils/src/pico.rs @@ -1,87 +1,418 @@ +use std::borrow::Borrow; use std::cmp::Ordering; use std::collections::HashMap; -use std::fmt::{self, Debug, Formatter}; -use std::num::NonZeroU32; +use std::fmt::{self, Debug, Display, Formatter}; +use std::hash::{Hash, Hasher}; +use std::num::NonZeroU64; +use std::ops::Deref; use std::sync::{LazyLock, RwLock}; -/// The global string interner. -static INTERNER: LazyLock<RwLock<Interner>> = LazyLock::new(|| { - RwLock::new(Interner { to_id: HashMap::new(), from_id: Vec::new() }) -}); +/// Marks a number as a bitcode encoded `PicoStr``. +const MARKER: u64 = 1 << 63; + +/// The global runtime string interner. +static INTERNER: LazyLock<RwLock<Interner>> = + LazyLock::new(|| RwLock::new(Interner { seen: HashMap::new(), strings: Vec::new() })); /// A string interner. struct Interner { - to_id: HashMap<&'static str, PicoStr>, - from_id: Vec<&'static str>, + seen: HashMap<&'static str, PicoStr>, + strings: Vec<&'static str>, } -/// An interned string. +/// An interned string representation that is cheap to copy and hash, but more +/// expensive to access. +/// +/// This type takes up 8 bytes and is copyable and null-optimized (i.e. +/// `Option<PicoStr>` also takes 8 bytes). /// -/// The API is purposefully kept small. This is because it might be relatively -/// slow to look up a string in the interner, so we want to avoid doing it -/// unnecessarily. For this reason, the user should use the [`PicoStr::resolve`] -/// method to get the underlying string, such that the lookup is done only once. +/// Supports compile-time string interning via [`PicoStr::constant`] in two +/// flavors: +/// - Strings of length at most 12 containing only chars from 'a'-'z', '1'-'4', +/// and '-' are stored inline in the number +/// - Other strings _can_ be compile-time interned the same way, but must first +/// be added to the list in `exceptions::LIST`. +/// +/// No such restrictions apply at runtime (via [`PicoStr::intern`]). #[derive(Copy, Clone, Eq, PartialEq, Hash)] -pub struct PicoStr(NonZeroU32); +pub struct PicoStr(NonZeroU64); impl PicoStr { - /// Creates a new interned string. - pub fn new(string: &str) -> Self { + /// Intern a string at runtime. + pub fn intern(string: &str) -> PicoStr { + // Try to use bitcode or exception representations. + if let Ok(value) = PicoStr::try_constant(string) { + return value; + } + // Try to find an existing entry that we can reuse. // // We could check with just a read lock, but if the string is not yet // present, we would then need to recheck after acquiring a write lock, // which is probably not worth it. let mut interner = INTERNER.write().unwrap(); - if let Some(&id) = interner.to_id.get(string) { + if let Some(&id) = interner.seen.get(string) { return id; } // Create a new entry forever by leaking the string. PicoStr is only // used for strings that aren't created en masse, so it is okay. - let num = u32::try_from(interner.from_id.len() + 1) - .and_then(NonZeroU32::try_from) - .expect("out of string ids"); - - let id = Self(num); + let num = exceptions::LIST.len() + interner.strings.len() + 1; + let id = Self(NonZeroU64::new(num as u64).unwrap()); let string = Box::leak(string.to_string().into_boxed_str()); - interner.to_id.insert(string, id); - interner.from_id.push(string); + interner.seen.insert(string, id); + interner.strings.push(string); id } - /// Resolves the interned string. - pub fn resolve(&self) -> &'static str { - INTERNER.read().unwrap().from_id[(self.0.get() - 1) as usize] + /// Creates a compile-time constant `PicoStr`. + /// + /// Should only be used in const contexts because it can panic. + #[track_caller] + pub const fn constant(string: &'static str) -> PicoStr { + match PicoStr::try_constant(string) { + Ok(value) => value, + Err(err) => panic!("{}", err.message()), + } + } + + /// Try to intern a string statically at compile-time. + pub const fn try_constant(string: &str) -> Result<PicoStr, bitcode::EncodingError> { + // Try to encode with bitcode. + let value = match bitcode::encode(string) { + // Store representation marker in high bit. Bitcode doesn't use + // 4 high bits. + Ok(v) => v | MARKER, + + // If that fails, try to use the exception list. + Err(e) => { + if let Some(i) = exceptions::get(string) { + // Offset by one to make it non-zero. + i as u64 + 1 + } else { + return Err(e); + } + } + }; + + match NonZeroU64::new(value) { + Some(value) => Ok(Self(value)), + None => unreachable!(), + } + } + + /// Resolve to a decoded string. + pub fn resolve(self) -> ResolvedPicoStr { + // If high bit is set, this is a bitcode-encoded string. + let value = self.0.get(); + if value & MARKER != 0 { + return bitcode::decode(value & !MARKER); + } + + let index = (value - 1) as usize; + let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) { + INTERNER.read().unwrap().strings[runtime] + } else { + exceptions::LIST[index] + }; + + ResolvedPicoStr(Repr::Static(string)) } } impl Debug for PicoStr { fn fmt(&self, f: &mut Formatter) -> fmt::Result { - self.resolve().fmt(f) + Debug::fmt(self.resolve().as_str(), f) + } +} + +/// A 5-bit encoding for strings with length up two 12 that are restricted to a +/// specific charset. +mod bitcode { + use super::{Repr, ResolvedPicoStr}; + + /// Maps from encodings to their bytes. + const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234"; + + /// Maps from bytes to their encodings. + const ENCODE: &[u8; 256] = &{ + let mut map = [0; 256]; + let mut i = 0; + while i < DECODE.len() { + map[DECODE[i] as usize] = i as u8; + i += 1; + } + map + }; + + /// Try to encode a string as a 64-bit integer. + pub const fn encode(string: &str) -> Result<u64, EncodingError> { + let bytes = string.as_bytes(); + + if bytes.len() > 12 { + return Err(EncodingError::TooLong); + } + + let mut num: u64 = 0; + let mut i = bytes.len(); + while i > 0 { + i -= 1; + let b = bytes[i]; + let v = ENCODE[b as usize]; + if v == 0 { + return Err(EncodingError::BadChar); + } + num <<= 5; + num |= v as u64; + } + + Ok(num) + } + + /// Decode the string for a 64-bit integer. + pub const fn decode(mut value: u64) -> ResolvedPicoStr { + let mut buf = [0; 12]; + let mut len = 0; + + while value != 0 { + let v = value & 0b11111; + buf[len as usize] = DECODE[v as usize]; + len += 1; + value >>= 5; + } + + ResolvedPicoStr(Repr::Inline(buf, len)) + } + + /// A failure during compile-time interning. + pub enum EncodingError { + TooLong, + BadChar, + } + + impl EncodingError { + pub const fn message(&self) -> &'static str { + match self { + Self::TooLong => { + "the maximum auto-internible string length is 12. \ + you can add an exception to typst-utils/src/pico.rs \ + to intern longer strings." + } + Self::BadChar => { + "can only auto-intern the chars 'a'-'z', '1'-'4', and '-'. \ + you can add an exception to typst-utils/src/pico.rs \ + to intern other strings." + } + } + } + } +} + +/// Compile-time interned strings that cannot be encoded with `bitcode`. +mod exceptions { + use std::cmp::Ordering; + + /// A global list of non-bitcode-encodable compile-time internible strings. + pub const LIST: &[&str] = &[ + "cjk-latin-spacing", + "discretionary-ligatures", + "historical-ligatures", + "number-clearance", + "number-margin", + "numbering-scope", + "page-numbering", + "par-line-marker", + "transparentize", + ]; + + /// Try to find the index of an exception if it exists. + pub const fn get(string: &str) -> Option<usize> { + let mut lo = 0; + let mut hi = LIST.len(); + while lo < hi { + let mid = (lo + hi) / 2; + match strcmp(string, LIST[mid]) { + Ordering::Less => hi = mid, + Ordering::Greater => lo = mid + 1, + Ordering::Equal => return Some(mid), + } + } + None + } + + /// Compare two strings. + const fn strcmp(a: &str, b: &str) -> Ordering { + let a = a.as_bytes(); + let b = b.as_bytes(); + let l = min(a.len(), b.len()); + + let mut i = 0; + while i < l { + if a[i] == b[i] { + i += 1; + } else if a[i] < b[i] { + return Ordering::Less; + } else { + return Ordering::Greater; + } + } + + if i < b.len() { + Ordering::Less + } else if i < a.len() { + Ordering::Greater + } else { + Ordering::Equal + } + } + + /// Determine the minimum of two integers. + const fn min(a: usize, b: usize) -> usize { + if a < b { + a + } else { + b + } + } +} + +/// This is returned by [`PicoStr::resolve`]. +/// +/// Dereferences to a `str`. +pub struct ResolvedPicoStr(Repr); + +/// Representation of a resolved string. +enum Repr { + Inline([u8; 12], u8), + Static(&'static str), +} + +impl ResolvedPicoStr { + /// Retrieve the underlying string. + pub fn as_str(&self) -> &str { + match &self.0 { + Repr::Inline(buf, len) => unsafe { + std::str::from_utf8_unchecked(&buf[..*len as usize]) + }, + Repr::Static(s) => s, + } } } -impl Ord for PicoStr { +impl Debug for ResolvedPicoStr { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Debug::fmt(self.as_str(), f) + } +} + +impl Display for ResolvedPicoStr { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(self.as_str(), f) + } +} + +impl Deref for ResolvedPicoStr { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +impl AsRef<str> for ResolvedPicoStr { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl Borrow<str> for ResolvedPicoStr { + fn borrow(&self) -> &str { + self.as_str() + } +} + +impl Eq for ResolvedPicoStr {} + +impl PartialEq for ResolvedPicoStr { + fn eq(&self, other: &Self) -> bool { + self.as_str().eq(other.as_str()) + } +} + +impl Ord for ResolvedPicoStr { fn cmp(&self, other: &Self) -> Ordering { - self.resolve().cmp(other.resolve()) + self.as_str().cmp(other.as_str()) } } -impl PartialOrd for PicoStr { +impl PartialOrd for ResolvedPicoStr { fn partial_cmp(&self, other: &Self) -> Option<Ordering> { Some(self.cmp(other)) } } -impl AsRef<str> for PicoStr { - fn as_ref(&self) -> &str { - self.resolve() +impl Hash for ResolvedPicoStr { + fn hash<H: Hasher>(&self, state: &mut H) { + self.as_str().hash(state); } } -impl From<&str> for PicoStr { - fn from(value: &str) -> Self { - Self::new(value) +#[cfg(test)] +mod tests { + use super::*; + + #[track_caller] + fn roundtrip(s: &str) { + assert_eq!(PicoStr::intern(s).resolve().as_str(), s); + } + + #[test] + fn test_pico_str() { + // Test comparing compile-time and runtime-interned bitcode string. + const H1: PicoStr = PicoStr::constant("h1"); + assert_eq!(H1, PicoStr::intern("h1")); + assert_eq!(H1.resolve().as_str(), "h1"); + + // Test comparing compile-time and runtime-interned exception. + const DISC: PicoStr = PicoStr::constant("discretionary-ligatures"); + assert_eq!(DISC, PicoStr::intern("discretionary-ligatures")); + assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures"); + + // Test just roundtripping some strings. + roundtrip(""); + roundtrip("hi"); + roundtrip("∆@<hi-10_"); + roundtrip("you"); + roundtrip("discretionary-ligatures"); + } + + /// Ensures that none of the exceptions is bitcode-encodable. + #[test] + fn test_exceptions_not_bitcode_encodable() { + for s in exceptions::LIST { + assert!( + bitcode::encode(s).is_err(), + "{s:?} can be encoded with bitcode and should not be an exception" + ); + } + } + + /// Ensures that the exceptions are sorted. + #[test] + fn test_exceptions_sorted() { + for group in exceptions::LIST.windows(2) { + assert!(group[0] < group[1], "{group:?} are out of order"); + } + } + + /// Ensures that all exceptions can be found. + #[test] + fn test_exception_find() { + for (i, s) in exceptions::LIST.iter().enumerate() { + assert_eq!(exceptions::get(s), Some(i), "wrong index for {s:?}"); + } + assert_eq!(exceptions::get("a"), None); + assert_eq!(exceptions::get("another-"), None); + assert_eq!(exceptions::get("z"), None); } } |
