summaryrefslogtreecommitdiff
path: root/crates/typst-utils/src
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2024-11-29 16:03:08 +0100
committerGitHub <noreply@github.com>2024-11-29 15:03:08 +0000
commitd40c8ab6ab4f7051a12e5ce9433439f3a5afb99f (patch)
tree8e5b2972b1c5ede14a05ae5525c4a07fb3dda300 /crates/typst-utils/src
parent055263ee9f2253de9f176970df9d47d3b2bd2467 (diff)
Compile-time `PicoStr` interning (#5491)
Diffstat (limited to 'crates/typst-utils/src')
-rw-r--r--crates/typst-utils/src/lib.rs2
-rw-r--r--crates/typst-utils/src/pico.rs405
2 files changed, 369 insertions, 38 deletions
diff --git a/crates/typst-utils/src/lib.rs b/crates/typst-utils/src/lib.rs
index e199e1bb..61703250 100644
--- a/crates/typst-utils/src/lib.rs
+++ b/crates/typst-utils/src/lib.rs
@@ -16,7 +16,7 @@ pub use self::bitset::{BitSet, SmallBitSet};
pub use self::deferred::Deferred;
pub use self::duration::format_duration;
pub use self::hash::LazyHash;
-pub use self::pico::PicoStr;
+pub use self::pico::{PicoStr, ResolvedPicoStr};
pub use self::round::{round_int_with_precision, round_with_precision};
pub use self::scalar::Scalar;
diff --git a/crates/typst-utils/src/pico.rs b/crates/typst-utils/src/pico.rs
index 7fcd3343..dbab14a1 100644
--- a/crates/typst-utils/src/pico.rs
+++ b/crates/typst-utils/src/pico.rs
@@ -1,87 +1,418 @@
+use std::borrow::Borrow;
use std::cmp::Ordering;
use std::collections::HashMap;
-use std::fmt::{self, Debug, Formatter};
-use std::num::NonZeroU32;
+use std::fmt::{self, Debug, Display, Formatter};
+use std::hash::{Hash, Hasher};
+use std::num::NonZeroU64;
+use std::ops::Deref;
use std::sync::{LazyLock, RwLock};
-/// The global string interner.
-static INTERNER: LazyLock<RwLock<Interner>> = LazyLock::new(|| {
- RwLock::new(Interner { to_id: HashMap::new(), from_id: Vec::new() })
-});
+/// Marks a number as a bitcode encoded `PicoStr``.
+const MARKER: u64 = 1 << 63;
+
+/// The global runtime string interner.
+static INTERNER: LazyLock<RwLock<Interner>> =
+ LazyLock::new(|| RwLock::new(Interner { seen: HashMap::new(), strings: Vec::new() }));
/// A string interner.
struct Interner {
- to_id: HashMap<&'static str, PicoStr>,
- from_id: Vec<&'static str>,
+ seen: HashMap<&'static str, PicoStr>,
+ strings: Vec<&'static str>,
}
-/// An interned string.
+/// An interned string representation that is cheap to copy and hash, but more
+/// expensive to access.
+///
+/// This type takes up 8 bytes and is copyable and null-optimized (i.e.
+/// `Option<PicoStr>` also takes 8 bytes).
///
-/// The API is purposefully kept small. This is because it might be relatively
-/// slow to look up a string in the interner, so we want to avoid doing it
-/// unnecessarily. For this reason, the user should use the [`PicoStr::resolve`]
-/// method to get the underlying string, such that the lookup is done only once.
+/// Supports compile-time string interning via [`PicoStr::constant`] in two
+/// flavors:
+/// - Strings of length at most 12 containing only chars from 'a'-'z', '1'-'4',
+/// and '-' are stored inline in the number
+/// - Other strings _can_ be compile-time interned the same way, but must first
+/// be added to the list in `exceptions::LIST`.
+///
+/// No such restrictions apply at runtime (via [`PicoStr::intern`]).
#[derive(Copy, Clone, Eq, PartialEq, Hash)]
-pub struct PicoStr(NonZeroU32);
+pub struct PicoStr(NonZeroU64);
impl PicoStr {
- /// Creates a new interned string.
- pub fn new(string: &str) -> Self {
+ /// Intern a string at runtime.
+ pub fn intern(string: &str) -> PicoStr {
+ // Try to use bitcode or exception representations.
+ if let Ok(value) = PicoStr::try_constant(string) {
+ return value;
+ }
+
// Try to find an existing entry that we can reuse.
//
// We could check with just a read lock, but if the string is not yet
// present, we would then need to recheck after acquiring a write lock,
// which is probably not worth it.
let mut interner = INTERNER.write().unwrap();
- if let Some(&id) = interner.to_id.get(string) {
+ if let Some(&id) = interner.seen.get(string) {
return id;
}
// Create a new entry forever by leaking the string. PicoStr is only
// used for strings that aren't created en masse, so it is okay.
- let num = u32::try_from(interner.from_id.len() + 1)
- .and_then(NonZeroU32::try_from)
- .expect("out of string ids");
-
- let id = Self(num);
+ let num = exceptions::LIST.len() + interner.strings.len() + 1;
+ let id = Self(NonZeroU64::new(num as u64).unwrap());
let string = Box::leak(string.to_string().into_boxed_str());
- interner.to_id.insert(string, id);
- interner.from_id.push(string);
+ interner.seen.insert(string, id);
+ interner.strings.push(string);
id
}
- /// Resolves the interned string.
- pub fn resolve(&self) -> &'static str {
- INTERNER.read().unwrap().from_id[(self.0.get() - 1) as usize]
+ /// Creates a compile-time constant `PicoStr`.
+ ///
+ /// Should only be used in const contexts because it can panic.
+ #[track_caller]
+ pub const fn constant(string: &'static str) -> PicoStr {
+ match PicoStr::try_constant(string) {
+ Ok(value) => value,
+ Err(err) => panic!("{}", err.message()),
+ }
+ }
+
+ /// Try to intern a string statically at compile-time.
+ pub const fn try_constant(string: &str) -> Result<PicoStr, bitcode::EncodingError> {
+ // Try to encode with bitcode.
+ let value = match bitcode::encode(string) {
+ // Store representation marker in high bit. Bitcode doesn't use
+ // 4 high bits.
+ Ok(v) => v | MARKER,
+
+ // If that fails, try to use the exception list.
+ Err(e) => {
+ if let Some(i) = exceptions::get(string) {
+ // Offset by one to make it non-zero.
+ i as u64 + 1
+ } else {
+ return Err(e);
+ }
+ }
+ };
+
+ match NonZeroU64::new(value) {
+ Some(value) => Ok(Self(value)),
+ None => unreachable!(),
+ }
+ }
+
+ /// Resolve to a decoded string.
+ pub fn resolve(self) -> ResolvedPicoStr {
+ // If high bit is set, this is a bitcode-encoded string.
+ let value = self.0.get();
+ if value & MARKER != 0 {
+ return bitcode::decode(value & !MARKER);
+ }
+
+ let index = (value - 1) as usize;
+ let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) {
+ INTERNER.read().unwrap().strings[runtime]
+ } else {
+ exceptions::LIST[index]
+ };
+
+ ResolvedPicoStr(Repr::Static(string))
}
}
impl Debug for PicoStr {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
- self.resolve().fmt(f)
+ Debug::fmt(self.resolve().as_str(), f)
+ }
+}
+
+/// A 5-bit encoding for strings with length up two 12 that are restricted to a
+/// specific charset.
+mod bitcode {
+ use super::{Repr, ResolvedPicoStr};
+
+ /// Maps from encodings to their bytes.
+ const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234";
+
+ /// Maps from bytes to their encodings.
+ const ENCODE: &[u8; 256] = &{
+ let mut map = [0; 256];
+ let mut i = 0;
+ while i < DECODE.len() {
+ map[DECODE[i] as usize] = i as u8;
+ i += 1;
+ }
+ map
+ };
+
+ /// Try to encode a string as a 64-bit integer.
+ pub const fn encode(string: &str) -> Result<u64, EncodingError> {
+ let bytes = string.as_bytes();
+
+ if bytes.len() > 12 {
+ return Err(EncodingError::TooLong);
+ }
+
+ let mut num: u64 = 0;
+ let mut i = bytes.len();
+ while i > 0 {
+ i -= 1;
+ let b = bytes[i];
+ let v = ENCODE[b as usize];
+ if v == 0 {
+ return Err(EncodingError::BadChar);
+ }
+ num <<= 5;
+ num |= v as u64;
+ }
+
+ Ok(num)
+ }
+
+ /// Decode the string for a 64-bit integer.
+ pub const fn decode(mut value: u64) -> ResolvedPicoStr {
+ let mut buf = [0; 12];
+ let mut len = 0;
+
+ while value != 0 {
+ let v = value & 0b11111;
+ buf[len as usize] = DECODE[v as usize];
+ len += 1;
+ value >>= 5;
+ }
+
+ ResolvedPicoStr(Repr::Inline(buf, len))
+ }
+
+ /// A failure during compile-time interning.
+ pub enum EncodingError {
+ TooLong,
+ BadChar,
+ }
+
+ impl EncodingError {
+ pub const fn message(&self) -> &'static str {
+ match self {
+ Self::TooLong => {
+ "the maximum auto-internible string length is 12. \
+ you can add an exception to typst-utils/src/pico.rs \
+ to intern longer strings."
+ }
+ Self::BadChar => {
+ "can only auto-intern the chars 'a'-'z', '1'-'4', and '-'. \
+ you can add an exception to typst-utils/src/pico.rs \
+ to intern other strings."
+ }
+ }
+ }
+ }
+}
+
+/// Compile-time interned strings that cannot be encoded with `bitcode`.
+mod exceptions {
+ use std::cmp::Ordering;
+
+ /// A global list of non-bitcode-encodable compile-time internible strings.
+ pub const LIST: &[&str] = &[
+ "cjk-latin-spacing",
+ "discretionary-ligatures",
+ "historical-ligatures",
+ "number-clearance",
+ "number-margin",
+ "numbering-scope",
+ "page-numbering",
+ "par-line-marker",
+ "transparentize",
+ ];
+
+ /// Try to find the index of an exception if it exists.
+ pub const fn get(string: &str) -> Option<usize> {
+ let mut lo = 0;
+ let mut hi = LIST.len();
+ while lo < hi {
+ let mid = (lo + hi) / 2;
+ match strcmp(string, LIST[mid]) {
+ Ordering::Less => hi = mid,
+ Ordering::Greater => lo = mid + 1,
+ Ordering::Equal => return Some(mid),
+ }
+ }
+ None
+ }
+
+ /// Compare two strings.
+ const fn strcmp(a: &str, b: &str) -> Ordering {
+ let a = a.as_bytes();
+ let b = b.as_bytes();
+ let l = min(a.len(), b.len());
+
+ let mut i = 0;
+ while i < l {
+ if a[i] == b[i] {
+ i += 1;
+ } else if a[i] < b[i] {
+ return Ordering::Less;
+ } else {
+ return Ordering::Greater;
+ }
+ }
+
+ if i < b.len() {
+ Ordering::Less
+ } else if i < a.len() {
+ Ordering::Greater
+ } else {
+ Ordering::Equal
+ }
+ }
+
+ /// Determine the minimum of two integers.
+ const fn min(a: usize, b: usize) -> usize {
+ if a < b {
+ a
+ } else {
+ b
+ }
+ }
+}
+
+/// This is returned by [`PicoStr::resolve`].
+///
+/// Dereferences to a `str`.
+pub struct ResolvedPicoStr(Repr);
+
+/// Representation of a resolved string.
+enum Repr {
+ Inline([u8; 12], u8),
+ Static(&'static str),
+}
+
+impl ResolvedPicoStr {
+ /// Retrieve the underlying string.
+ pub fn as_str(&self) -> &str {
+ match &self.0 {
+ Repr::Inline(buf, len) => unsafe {
+ std::str::from_utf8_unchecked(&buf[..*len as usize])
+ },
+ Repr::Static(s) => s,
+ }
}
}
-impl Ord for PicoStr {
+impl Debug for ResolvedPicoStr {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ Debug::fmt(self.as_str(), f)
+ }
+}
+
+impl Display for ResolvedPicoStr {
+ fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+ Display::fmt(self.as_str(), f)
+ }
+}
+
+impl Deref for ResolvedPicoStr {
+ type Target = str;
+
+ fn deref(&self) -> &Self::Target {
+ self.as_str()
+ }
+}
+
+impl AsRef<str> for ResolvedPicoStr {
+ fn as_ref(&self) -> &str {
+ self.as_str()
+ }
+}
+
+impl Borrow<str> for ResolvedPicoStr {
+ fn borrow(&self) -> &str {
+ self.as_str()
+ }
+}
+
+impl Eq for ResolvedPicoStr {}
+
+impl PartialEq for ResolvedPicoStr {
+ fn eq(&self, other: &Self) -> bool {
+ self.as_str().eq(other.as_str())
+ }
+}
+
+impl Ord for ResolvedPicoStr {
fn cmp(&self, other: &Self) -> Ordering {
- self.resolve().cmp(other.resolve())
+ self.as_str().cmp(other.as_str())
}
}
-impl PartialOrd for PicoStr {
+impl PartialOrd for ResolvedPicoStr {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
-impl AsRef<str> for PicoStr {
- fn as_ref(&self) -> &str {
- self.resolve()
+impl Hash for ResolvedPicoStr {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.as_str().hash(state);
}
}
-impl From<&str> for PicoStr {
- fn from(value: &str) -> Self {
- Self::new(value)
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[track_caller]
+ fn roundtrip(s: &str) {
+ assert_eq!(PicoStr::intern(s).resolve().as_str(), s);
+ }
+
+ #[test]
+ fn test_pico_str() {
+ // Test comparing compile-time and runtime-interned bitcode string.
+ const H1: PicoStr = PicoStr::constant("h1");
+ assert_eq!(H1, PicoStr::intern("h1"));
+ assert_eq!(H1.resolve().as_str(), "h1");
+
+ // Test comparing compile-time and runtime-interned exception.
+ const DISC: PicoStr = PicoStr::constant("discretionary-ligatures");
+ assert_eq!(DISC, PicoStr::intern("discretionary-ligatures"));
+ assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures");
+
+ // Test just roundtripping some strings.
+ roundtrip("");
+ roundtrip("hi");
+ roundtrip("∆@<hi-10_");
+ roundtrip("you");
+ roundtrip("discretionary-ligatures");
+ }
+
+ /// Ensures that none of the exceptions is bitcode-encodable.
+ #[test]
+ fn test_exceptions_not_bitcode_encodable() {
+ for s in exceptions::LIST {
+ assert!(
+ bitcode::encode(s).is_err(),
+ "{s:?} can be encoded with bitcode and should not be an exception"
+ );
+ }
+ }
+
+ /// Ensures that the exceptions are sorted.
+ #[test]
+ fn test_exceptions_sorted() {
+ for group in exceptions::LIST.windows(2) {
+ assert!(group[0] < group[1], "{group:?} are out of order");
+ }
+ }
+
+ /// Ensures that all exceptions can be found.
+ #[test]
+ fn test_exception_find() {
+ for (i, s) in exceptions::LIST.iter().enumerate() {
+ assert_eq!(exceptions::get(s), Some(i), "wrong index for {s:?}");
+ }
+ assert_eq!(exceptions::get("a"), None);
+ assert_eq!(exceptions::get("another-"), None);
+ assert_eq!(exceptions::get("z"), None);
}
}