diff options
Diffstat (limited to 'src/eval/str.rs')
| -rw-r--r-- | src/eval/str.rs | 514 |
1 files changed, 514 insertions, 0 deletions
diff --git a/src/eval/str.rs b/src/eval/str.rs new file mode 100644 index 00000000..63ea5dc8 --- /dev/null +++ b/src/eval/str.rs @@ -0,0 +1,514 @@ +use std::borrow::{Borrow, Cow}; +use std::fmt::{self, Debug, Display, Formatter, Write}; +use std::hash::{Hash, Hasher}; +use std::ops::{Add, AddAssign, Deref}; + +use ecow::EcoString; +use unicode_segmentation::UnicodeSegmentation; + +use super::{castable, dict, Array, Dict, Value}; +use crate::diag::StrResult; +use crate::geom::GenAlign; + +/// Create a new [`Str`] from a format string. +#[macro_export] +#[doc(hidden)] +macro_rules! __format_str { + ($($tts:tt)*) => {{ + $crate::eval::Str::from($crate::eval::eco_format!($($tts)*)) + }}; +} + +#[doc(inline)] +pub use crate::__format_str as format_str; +#[doc(hidden)] +pub use ecow::eco_format; + +/// An immutable reference counted string. +#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct Str(EcoString); + +impl Str { + /// Create a new, empty string. + pub fn new() -> Self { + Self(EcoString::new()) + } + + /// The length of the string in bytes. + pub fn len(&self) -> i64 { + self.0.len() as i64 + } + + /// A string slice containing the entire string. + pub fn as_str(&self) -> &str { + self + } + + /// Extract the first grapheme cluster. + pub fn first(&self) -> StrResult<Self> { + self.0 + .graphemes(true) + .next() + .map(Into::into) + .ok_or_else(string_is_empty) + } + + /// Extract the last grapheme cluster. + pub fn last(&self) -> StrResult<Self> { + self.0 + .graphemes(true) + .next_back() + .map(Into::into) + .ok_or_else(string_is_empty) + } + + /// Extract the grapheme cluster at the given index. + pub fn at(&self, index: i64) -> StrResult<Self> { + let len = self.len(); + let grapheme = self.0[self.locate(index)?..] + .graphemes(true) + .next() + .ok_or_else(|| out_of_bounds(index, len))?; + Ok(grapheme.into()) + } + + /// Extract a contigous substring. + pub fn slice(&self, start: i64, end: Option<i64>) -> StrResult<Self> { + let start = self.locate(start)?; + let end = self.locate(end.unwrap_or(self.len()))?.max(start); + Ok(self.0[start..end].into()) + } + + /// The grapheme clusters the string consists of. + pub fn clusters(&self) -> Array { + self.as_str().graphemes(true).map(|s| Value::Str(s.into())).collect() + } + + /// The codepoints the string consists of. + pub fn codepoints(&self) -> Array { + self.chars().map(|c| Value::Str(c.into())).collect() + } + + /// Whether the given pattern exists in this string. + pub fn contains(&self, pattern: StrPattern) -> bool { + match pattern { + StrPattern::Str(pat) => self.0.contains(pat.as_str()), + StrPattern::Regex(re) => re.is_match(self), + } + } + + /// Whether this string begins with the given pattern. + pub fn starts_with(&self, pattern: StrPattern) -> bool { + match pattern { + StrPattern::Str(pat) => self.0.starts_with(pat.as_str()), + StrPattern::Regex(re) => re.find(self).map_or(false, |m| m.start() == 0), + } + } + + /// Whether this string ends with the given pattern. + pub fn ends_with(&self, pattern: StrPattern) -> bool { + match pattern { + StrPattern::Str(pat) => self.0.ends_with(pat.as_str()), + StrPattern::Regex(re) => { + re.find_iter(self).last().map_or(false, |m| m.end() == self.0.len()) + } + } + } + + /// The text of the pattern's first match in this string. + pub fn find(&self, pattern: StrPattern) -> Option<Self> { + match pattern { + StrPattern::Str(pat) => self.0.contains(pat.as_str()).then(|| pat), + StrPattern::Regex(re) => re.find(self).map(|m| m.as_str().into()), + } + } + + /// The position of the pattern's first match in this string. + pub fn position(&self, pattern: StrPattern) -> Option<i64> { + match pattern { + StrPattern::Str(pat) => self.0.find(pat.as_str()).map(|i| i as i64), + StrPattern::Regex(re) => re.find(self).map(|m| m.start() as i64), + } + } + + /// The start and, text and capture groups (if any) of the first match of + /// the pattern in this string. + pub fn match_(&self, pattern: StrPattern) -> Option<Dict> { + match pattern { + StrPattern::Str(pat) => { + self.0.match_indices(pat.as_str()).next().map(match_to_dict) + } + StrPattern::Regex(re) => re.captures(self).map(captures_to_dict), + } + } + + /// The start, end, text and capture groups (if any) of all matches of the + /// pattern in this string. + pub fn matches(&self, pattern: StrPattern) -> Array { + match pattern { + StrPattern::Str(pat) => self + .0 + .match_indices(pat.as_str()) + .map(match_to_dict) + .map(Value::Dict) + .collect(), + StrPattern::Regex(re) => re + .captures_iter(self) + .map(captures_to_dict) + .map(Value::Dict) + .collect(), + } + } + + /// Split this string at whitespace or a specific pattern. + pub fn split(&self, pattern: Option<StrPattern>) -> Array { + let s = self.as_str(); + match pattern { + None => s.split_whitespace().map(|v| Value::Str(v.into())).collect(), + Some(StrPattern::Str(pat)) => { + s.split(pat.as_str()).map(|v| Value::Str(v.into())).collect() + } + Some(StrPattern::Regex(re)) => { + re.split(s).map(|v| Value::Str(v.into())).collect() + } + } + } + + /// Trim either whitespace or the given pattern at both or just one side of + /// the string. If `repeat` is true, the pattern is trimmed repeatedly + /// instead of just once. Repeat must only be given in combination with a + /// pattern. + pub fn trim( + &self, + pattern: Option<StrPattern>, + at: Option<StrSide>, + repeat: bool, + ) -> Self { + let mut start = matches!(at, Some(StrSide::Start) | None); + let end = matches!(at, Some(StrSide::End) | None); + + let trimmed = match pattern { + None => match at { + None => self.0.trim(), + Some(StrSide::Start) => self.0.trim_start(), + Some(StrSide::End) => self.0.trim_end(), + }, + Some(StrPattern::Str(pat)) => { + let pat = pat.as_str(); + let mut s = self.as_str(); + if repeat { + if start { + s = s.trim_start_matches(pat); + } + if end { + s = s.trim_end_matches(pat); + } + } else { + if start { + s = s.strip_prefix(pat).unwrap_or(s); + } + if end { + s = s.strip_suffix(pat).unwrap_or(s); + } + } + s + } + Some(StrPattern::Regex(re)) => { + let s = self.as_str(); + let mut last = 0; + let mut range = 0..s.len(); + + for m in re.find_iter(s) { + // Does this match follow directly after the last one? + let consecutive = last == m.start(); + + // As long as we're consecutive and still trimming at the + // start, trim. + start &= consecutive; + if start { + range.start = m.end(); + start &= repeat; + } + + // Reset end trim if we aren't consecutive anymore or aren't + // repeating. + if end && (!consecutive || !repeat) { + range.end = m.start(); + } + + last = m.end(); + } + + // Is the last match directly at the end? + if last < s.len() { + range.end = s.len(); + } + + &s[range.start..range.start.max(range.end)] + } + }; + + trimmed.into() + } + + /// Replace at most `count` occurances of the given pattern with a + /// replacement string (beginning from the start). + pub fn replace(&self, pattern: StrPattern, with: Self, count: Option<usize>) -> Self { + match pattern { + StrPattern::Str(pat) => match count { + Some(n) => self.0.replacen(pat.as_str(), &with, n).into(), + None => self.0.replace(pat.as_str(), &with).into(), + }, + StrPattern::Regex(re) => match count { + Some(n) => re.replacen(self, n, with.as_str()).into(), + None => re.replace(self, with.as_str()).into(), + }, + } + } + + /// Repeat the string a number of times. + pub fn repeat(&self, n: i64) -> StrResult<Self> { + let n = usize::try_from(n) + .ok() + .and_then(|n| self.0.len().checked_mul(n).map(|_| n)) + .ok_or_else(|| format!("cannot repeat this string {} times", n))?; + + Ok(Self(self.0.repeat(n))) + } + + /// Resolve an index. + fn locate(&self, index: i64) -> StrResult<usize> { + let wrapped = + if index >= 0 { Some(index) } else { self.len().checked_add(index) }; + + let resolved = wrapped + .and_then(|v| usize::try_from(v).ok()) + .filter(|&v| v <= self.0.len()) + .ok_or_else(|| out_of_bounds(index, self.len()))?; + + if !self.0.is_char_boundary(resolved) { + return Err(not_a_char_boundary(index)); + } + + Ok(resolved) + } +} + +/// The out of bounds access error message. +#[cold] +fn out_of_bounds(index: i64, len: i64) -> EcoString { + eco_format!("string index out of bounds (index: {}, len: {})", index, len) +} + +/// The char boundary access error message. +#[cold] +fn not_a_char_boundary(index: i64) -> EcoString { + eco_format!("string index {} is not a character boundary", index) +} + +/// The error message when the string is empty. +#[cold] +fn string_is_empty() -> EcoString { + "string is empty".into() +} + +/// Convert an item of std's `match_indices` to a dictionary. +fn match_to_dict((start, text): (usize, &str)) -> Dict { + dict! { + "start" => Value::Int(start as i64), + "end" => Value::Int((start + text.len()) as i64), + "text" => Value::Str(text.into()), + "captures" => Value::Array(Array::new()), + } +} + +/// Convert regex captures to a dictionary. +fn captures_to_dict(cap: regex::Captures) -> Dict { + let m = cap.get(0).expect("missing first match"); + dict! { + "start" => Value::Int(m.start() as i64), + "end" => Value::Int(m.end() as i64), + "text" => Value::Str(m.as_str().into()), + "captures" => Value::Array( + cap.iter() + .skip(1) + .map(|opt| opt.map_or(Value::None, |m| m.as_str().into())) + .collect(), + ), + } +} + +impl Deref for Str { + type Target = str; + + fn deref(&self) -> &str { + &self.0 + } +} + +impl Display for Str { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.pad(self) + } +} + +impl Debug for Str { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.write_char('"')?; + for c in self.chars() { + match c { + '\0' => f.write_str("\\u{0}")?, + '\'' => f.write_str("'")?, + '"' => f.write_str(r#"\""#)?, + _ => Display::fmt(&c.escape_debug(), f)?, + } + } + f.write_char('"') + } +} + +impl Add for Str { + type Output = Self; + + fn add(mut self, rhs: Self) -> Self::Output { + self += rhs; + self + } +} + +impl AddAssign for Str { + fn add_assign(&mut self, rhs: Self) { + self.0.push_str(rhs.as_str()); + } +} + +impl AsRef<str> for Str { + fn as_ref(&self) -> &str { + self + } +} + +impl Borrow<str> for Str { + fn borrow(&self) -> &str { + self + } +} + +impl From<char> for Str { + fn from(c: char) -> Self { + Self(c.into()) + } +} + +impl From<&str> for Str { + fn from(s: &str) -> Self { + Self(s.into()) + } +} + +impl From<EcoString> for Str { + fn from(s: EcoString) -> Self { + Self(s) + } +} + +impl From<String> for Str { + fn from(s: String) -> Self { + Self(s.into()) + } +} + +impl From<Cow<'_, str>> for Str { + fn from(s: Cow<str>) -> Self { + Self(s.into()) + } +} + +impl FromIterator<char> for Str { + fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl From<Str> for EcoString { + fn from(str: Str) -> Self { + str.0 + } +} + +impl From<Str> for String { + fn from(s: Str) -> Self { + s.0.into() + } +} + +/// A regular expression. +#[derive(Clone)] +pub struct Regex(regex::Regex); + +impl Regex { + /// Create a new regular expression. + pub fn new(re: &str) -> StrResult<Self> { + regex::Regex::new(re).map(Self).map_err(|err| eco_format!("{err}")) + } +} + +impl Deref for Regex { + type Target = regex::Regex; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Debug for Regex { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "regex({:?})", self.0.as_str()) + } +} + +impl PartialEq for Regex { + fn eq(&self, other: &Self) -> bool { + self.0.as_str() == other.0.as_str() + } +} + +impl Hash for Regex { + fn hash<H: Hasher>(&self, state: &mut H) { + self.0.as_str().hash(state); + } +} + +/// A pattern which can be searched for in a string. +#[derive(Debug, Clone)] +pub enum StrPattern { + /// Just a string. + Str(Str), + /// A regular expression. + Regex(Regex), +} + +castable! { + StrPattern, + text: Str => Self::Str(text), + regex: Regex => Self::Regex(regex), +} + +/// A side of a string. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +pub enum StrSide { + /// The logical start of the string, may be left or right depending on the + /// language. + Start, + /// The logical end of the string. + End, +} + +castable! { + StrSide, + align: GenAlign => match align { + GenAlign::Start => Self::Start, + GenAlign::End => Self::End, + _ => Err("expected either `start` or `end`")?, + }, +} |
