diff options
| author | Laurenz <laurmaedje@gmail.com> | 2025-01-08 11:57:56 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-01-08 10:57:56 +0000 |
| commit | dacd6acd5e73d35c6e7a7a3b144f16ae70d03daa (patch) | |
| tree | ca75ce7d4d5365fbe2424a356cefe9cb223d429f /crates/typst-library/src/foundations | |
| parent | 0a374d238016c0101d11cbc3f4bc621f3895ad36 (diff) | |
More flexible and efficient `Bytes` representation (#5670)
Diffstat (limited to 'crates/typst-library/src/foundations')
| -rw-r--r-- | crates/typst-library/src/foundations/bytes.rs | 144 | ||||
| -rw-r--r-- | crates/typst-library/src/foundations/float.rs | 12 | ||||
| -rw-r--r-- | crates/typst-library/src/foundations/int.rs | 5 | ||||
| -rw-r--r-- | crates/typst-library/src/foundations/plugin.rs | 2 | ||||
| -rw-r--r-- | crates/typst-library/src/foundations/value.rs | 6 |
5 files changed, 119 insertions, 50 deletions
diff --git a/crates/typst-library/src/foundations/bytes.rs b/crates/typst-library/src/foundations/bytes.rs index 05fe4763..20034d07 100644 --- a/crates/typst-library/src/foundations/bytes.rs +++ b/crates/typst-library/src/foundations/bytes.rs @@ -1,5 +1,6 @@ -use std::borrow::Cow; +use std::any::Any; use std::fmt::{self, Debug, Formatter}; +use std::hash::{Hash, Hasher}; use std::ops::{Add, AddAssign, Deref}; use std::sync::Arc; @@ -39,18 +40,44 @@ use crate::foundations::{cast, func, scope, ty, Array, Reflect, Repr, Str, Value /// #str(data.slice(1, 4)) /// ``` #[ty(scope, cast)] -#[derive(Clone, Hash, Eq, PartialEq)] -pub struct Bytes(Arc<LazyHash<Cow<'static, [u8]>>>); +#[derive(Clone, Hash)] +#[allow(clippy::derived_hash_with_manual_eq)] +pub struct Bytes(Arc<LazyHash<dyn Bytelike>>); impl Bytes { - /// Create a buffer from a static byte slice. - pub fn from_static(slice: &'static [u8]) -> Self { - Self(Arc::new(LazyHash::new(Cow::Borrowed(slice)))) + /// Create `Bytes` from anything byte-like. + /// + /// The `data` type will directly back this bytes object. This means you can + /// e.g. pass `&'static [u8]` or `[u8; 8]` and no extra vector will be + /// allocated. + /// + /// If the type is `Vec<u8>` and the `Bytes` are unique (i.e. not cloned), + /// the vector will be reused when mutating to the `Bytes`. + /// + /// If your source type is a string, prefer [`Bytes::from_string`] to + /// directly use the UTF-8 encoded string data without any copying. + pub fn new<T>(data: T) -> Self + where + T: AsRef<[u8]> + Send + Sync + 'static, + { + Self(Arc::new(LazyHash::new(data))) + } + + /// Create `Bytes` from anything string-like, implicitly viewing the UTF-8 + /// representation. + /// + /// The `data` type will directly back this bytes object. This means you can + /// e.g. pass `String` or `EcoString` without any copying. + pub fn from_string<T>(data: T) -> Self + where + T: AsRef<str> + Send + Sync + 'static, + { + Self(Arc::new(LazyHash::new(StrWrapper(data)))) } /// Return `true` if the length is 0. pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.as_slice().is_empty() } /// Return a view into the buffer. @@ -60,7 +87,7 @@ impl Bytes { /// Return a copy of the buffer as a vector. pub fn to_vec(&self) -> Vec<u8> { - self.0.to_vec() + self.as_slice().to_vec() } /// Resolve an index or throw an out of bounds error. @@ -72,12 +99,10 @@ impl Bytes { /// /// `index == len` is considered in bounds. fn locate_opt(&self, index: i64) -> Option<usize> { + let len = self.as_slice().len(); let wrapped = - if index >= 0 { Some(index) } else { (self.len() as i64).checked_add(index) }; - - wrapped - .and_then(|v| usize::try_from(v).ok()) - .filter(|&v| v <= self.0.len()) + if index >= 0 { Some(index) } else { (len as i64).checked_add(index) }; + wrapped.and_then(|v| usize::try_from(v).ok()).filter(|&v| v <= len) } } @@ -106,7 +131,7 @@ impl Bytes { /// The length in bytes. #[func(title = "Length")] pub fn len(&self) -> usize { - self.0.len() + self.as_slice().len() } /// Returns the byte at the specified index. Returns the default value if @@ -122,13 +147,13 @@ impl Bytes { default: Option<Value>, ) -> StrResult<Value> { self.locate_opt(index) - .and_then(|i| self.0.get(i).map(|&b| Value::Int(b.into()))) + .and_then(|i| self.as_slice().get(i).map(|&b| Value::Int(b.into()))) .or(default) .ok_or_else(|| out_of_bounds_no_default(index, self.len())) } - /// Extracts a subslice of the bytes. Fails with an error if the start or end - /// index is out of bounds. + /// Extracts a subslice of the bytes. Fails with an error if the start or + /// end index is out of bounds. #[func] pub fn slice( &self, @@ -148,9 +173,17 @@ impl Bytes { if end.is_none() { end = count.map(|c: i64| start + c); } + let start = self.locate(start)?; let end = self.locate(end.unwrap_or(self.len() as i64))?.max(start); - Ok(self.0[start..end].into()) + let slice = &self.as_slice()[start..end]; + + // We could hold a view into the original bytes here instead of + // making a copy, but it's unclear when that's worth it. Java + // originally did that for strings, but went back on it because a + // very small view into a very large buffer would be a sort of + // memory leak. + Ok(Bytes::new(slice.to_vec())) } } @@ -170,25 +203,21 @@ impl Deref for Bytes { type Target = [u8]; fn deref(&self) -> &Self::Target { - &self.0 + self.0.as_bytes() } } -impl AsRef<[u8]> for Bytes { - fn as_ref(&self) -> &[u8] { - self - } -} +impl Eq for Bytes {} -impl From<&[u8]> for Bytes { - fn from(slice: &[u8]) -> Self { - Self(Arc::new(LazyHash::new(slice.to_vec().into()))) +impl PartialEq for Bytes { + fn eq(&self, other: &Self) -> bool { + self.0.eq(&other.0) } } -impl From<Vec<u8>> for Bytes { - fn from(vec: Vec<u8>) -> Self { - Self(Arc::new(LazyHash::new(vec.into()))) +impl AsRef<[u8]> for Bytes { + fn as_ref(&self) -> &[u8] { + self } } @@ -207,10 +236,12 @@ impl AddAssign for Bytes { // Nothing to do } else if self.is_empty() { *self = rhs; - } else if Arc::strong_count(&self.0) == 1 && matches!(**self.0, Cow::Owned(_)) { - Arc::make_mut(&mut self.0).to_mut().extend_from_slice(&rhs); + } else if let Some(vec) = Arc::get_mut(&mut self.0) + .and_then(|unique| unique.as_any_mut().downcast_mut::<Vec<u8>>()) + { + vec.extend_from_slice(&rhs); } else { - *self = Self::from([self.as_slice(), rhs.as_slice()].concat()); + *self = Self::new([self.as_slice(), rhs.as_slice()].concat()); } } } @@ -228,20 +259,61 @@ impl Serialize for Bytes { } } +/// Any type that can back a byte buffer. +trait Bytelike: Send + Sync { + fn as_bytes(&self) -> &[u8]; + fn as_any_mut(&mut self) -> &mut dyn Any; +} + +impl<T> Bytelike for T +where + T: AsRef<[u8]> + Send + Sync + 'static, +{ + fn as_bytes(&self) -> &[u8] { + self.as_ref() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl Hash for dyn Bytelike { + fn hash<H: Hasher>(&self, state: &mut H) { + self.as_bytes().hash(state); + } +} + +/// Makes string-like objects usable with `Bytes`. +struct StrWrapper<T>(T); + +impl<T> Bytelike for StrWrapper<T> +where + T: AsRef<str> + Send + Sync + 'static, +{ + fn as_bytes(&self) -> &[u8] { + self.0.as_ref().as_bytes() + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + /// A value that can be cast to bytes. pub struct ToBytes(Bytes); cast! { ToBytes, - v: Str => Self(v.as_bytes().into()), + v: Str => Self(Bytes::from_string(v)), v: Array => Self(v.iter() .map(|item| match item { Value::Int(byte @ 0..=255) => Ok(*byte as u8), Value::Int(_) => bail!("number must be between 0 and 255"), value => Err(<u8 as Reflect>::error(value)), }) - .collect::<Result<Vec<u8>, _>>()? - .into() + .collect::<Result<Vec<u8>, _>>() + .map(Bytes::new)? ), v: Bytes => Self(v), } diff --git a/crates/typst-library/src/foundations/float.rs b/crates/typst-library/src/foundations/float.rs index c3d4e0e7..fcc46b03 100644 --- a/crates/typst-library/src/foundations/float.rs +++ b/crates/typst-library/src/foundations/float.rs @@ -163,18 +163,14 @@ impl f64 { size: u32, ) -> StrResult<Bytes> { Ok(match size { - 8 => match endian { + 8 => Bytes::new(match endian { Endianness::Little => self.to_le_bytes(), Endianness::Big => self.to_be_bytes(), - } - .as_slice() - .into(), - 4 => match endian { + }), + 4 => Bytes::new(match endian { Endianness::Little => (self as f32).to_le_bytes(), Endianness::Big => (self as f32).to_be_bytes(), - } - .as_slice() - .into(), + }), _ => bail!("size must be either 4 or 8"), }) } diff --git a/crates/typst-library/src/foundations/int.rs b/crates/typst-library/src/foundations/int.rs index bddffada..83a89bf8 100644 --- a/crates/typst-library/src/foundations/int.rs +++ b/crates/typst-library/src/foundations/int.rs @@ -1,6 +1,7 @@ use std::num::{NonZeroI64, NonZeroIsize, NonZeroU64, NonZeroUsize, ParseIntError}; use ecow::{eco_format, EcoString}; +use smallvec::SmallVec; use crate::diag::{bail, StrResult}; use crate::foundations::{ @@ -322,7 +323,7 @@ impl i64 { Endianness::Little => self.to_le_bytes(), }; - let mut buf = vec![0u8; size]; + let mut buf = SmallVec::<[u8; 8]>::from_elem(0, size); match endian { Endianness::Big => { // Copy the bytes from the array to the buffer, starting from @@ -339,7 +340,7 @@ impl i64 { } } - Bytes::from(buf) + Bytes::new(buf) } } diff --git a/crates/typst-library/src/foundations/plugin.rs b/crates/typst-library/src/foundations/plugin.rs index f57257a4..a7c341d8 100644 --- a/crates/typst-library/src/foundations/plugin.rs +++ b/crates/typst-library/src/foundations/plugin.rs @@ -293,7 +293,7 @@ impl Plugin { _ => bail!("plugin did not respect the protocol"), }; - Ok(output.into()) + Ok(Bytes::new(output)) } /// An iterator over all the function names defined by the plugin. diff --git a/crates/typst-library/src/foundations/value.rs b/crates/typst-library/src/foundations/value.rs index eb0d6eed..efc480d3 100644 --- a/crates/typst-library/src/foundations/value.rs +++ b/crates/typst-library/src/foundations/value.rs @@ -459,15 +459,15 @@ impl<'de> Visitor<'de> for ValueVisitor { } fn visit_bytes<E: Error>(self, v: &[u8]) -> Result<Self::Value, E> { - Ok(Bytes::from(v).into_value()) + Ok(Bytes::new(v.to_vec()).into_value()) } fn visit_borrowed_bytes<E: Error>(self, v: &'de [u8]) -> Result<Self::Value, E> { - Ok(Bytes::from(v).into_value()) + Ok(Bytes::new(v.to_vec()).into_value()) } fn visit_byte_buf<E: Error>(self, v: Vec<u8>) -> Result<Self::Value, E> { - Ok(Bytes::from(v).into_value()) + Ok(Bytes::new(v).into_value()) } fn visit_none<E: Error>(self) -> Result<Self::Value, E> { |
