summaryrefslogtreecommitdiff
path: root/src/eval/str.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/eval/str.rs')
-rw-r--r--src/eval/str.rs514
1 files changed, 514 insertions, 0 deletions
diff --git a/src/eval/str.rs b/src/eval/str.rs
new file mode 100644
index 00000000..63ea5dc8
--- /dev/null
+++ b/src/eval/str.rs
@@ -0,0 +1,514 @@
+use std::borrow::{Borrow, Cow};
+use std::fmt::{self, Debug, Display, Formatter, Write};
+use std::hash::{Hash, Hasher};
+use std::ops::{Add, AddAssign, Deref};
+
+use ecow::EcoString;
+use unicode_segmentation::UnicodeSegmentation;
+
+use super::{castable, dict, Array, Dict, Value};
+use crate::diag::StrResult;
+use crate::geom::GenAlign;
+
+/// Build a [`Str`] from `format!`-style arguments.
+///
+/// All tokens are forwarded to `eco_format!` and the formatted result is
+/// converted into a [`Str`].
+#[macro_export]
+#[doc(hidden)]
+macro_rules! __format_str {
+    ($($args:tt)*) => {{
+        $crate::eval::Str::from($crate::eval::eco_format!($($args)*))
+    }};
+}
+
+#[doc(inline)]
+pub use crate::__format_str as format_str;
+#[doc(hidden)]
+pub use ecow::eco_format;
+
+/// An immutable reference counted string, stored as a newtype around [`EcoString`].
+#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct Str(EcoString);
+
+impl Str {
+    /// Construct a string that contains no characters.
+    pub fn new() -> Self {
+        let empty = EcoString::new();
+        Str(empty)
+    }
+
+    /// The number of bytes the string occupies, as a signed integer.
+    pub fn len(&self) -> i64 {
+        let bytes: usize = self.0.len();
+        bytes as i64
+    }
+
+    /// Borrow the whole string as a plain string slice.
+    pub fn as_str(&self) -> &str {
+        &self.0
+    }
+
+    /// Return the first grapheme cluster of the string.
+    ///
+    /// Fails with the "string is empty" message when there is no cluster.
+    pub fn first(&self) -> StrResult<Self> {
+        match self.0.graphemes(true).next() {
+            None => Err(string_is_empty()),
+            Some(cluster) => Ok(cluster.into()),
+        }
+    }
+
+    /// Return the last grapheme cluster of the string.
+    ///
+    /// Fails with the "string is empty" message when there is no cluster.
+    pub fn last(&self) -> StrResult<Self> {
+        match self.0.graphemes(true).next_back() {
+            None => Err(string_is_empty()),
+            Some(cluster) => Ok(cluster.into()),
+        }
+    }
+
+    /// Return the grapheme cluster that begins at the given byte index.
+    ///
+    /// The index is resolved by `locate`, so negative values count from the
+    /// end. Fails when the index is out of bounds, is not a character
+    /// boundary, or points at the very end of the string.
+    pub fn at(&self, index: i64) -> StrResult<Self> {
+        let offset = self.locate(index)?;
+        match self.0[offset..].graphemes(true).next() {
+            Some(cluster) => Ok(cluster.into()),
+            None => Err(out_of_bounds(index, self.len())),
+        }
+    }
+
+    /// Extract a contiguous substring between two byte indices.
+    ///
+    /// Both indices are resolved by `locate`; a missing `end` means the end
+    /// of the string. An end that lies before the start yields an empty
+    /// string instead of an error.
+    pub fn slice(&self, start: i64, end: Option<i64>) -> StrResult<Self> {
+        let from = self.locate(start)?;
+        let until = match end {
+            Some(index) => self.locate(index)?,
+            None => self.locate(self.len())?,
+        };
+        let to = until.max(from);
+        Ok(self.0[from..to].into())
+    }
+
+    /// Collect every grapheme cluster of the string into an array of strings.
+    pub fn clusters(&self) -> Array {
+        self.as_str()
+            .graphemes(true)
+            .map(Str::from)
+            .map(Value::Str)
+            .collect()
+    }
+
+    /// Collect every codepoint of the string into an array of one-character
+    /// strings.
+    pub fn codepoints(&self) -> Array {
+        self.chars().map(Str::from).map(Value::Str).collect()
+    }
+
+    /// Check whether the pattern occurs anywhere in this string.
+    pub fn contains(&self, pattern: StrPattern) -> bool {
+        let haystack: &str = self;
+        match pattern {
+            StrPattern::Regex(re) => re.is_match(haystack),
+            StrPattern::Str(needle) => haystack.contains(needle.as_str()),
+        }
+    }
+
+    /// Check whether this string begins with the pattern.
+    ///
+    /// For a regular expression, the first match must start at offset zero.
+    pub fn starts_with(&self, pattern: StrPattern) -> bool {
+        let text: &str = self;
+        match pattern {
+            StrPattern::Regex(re) => matches!(re.find(text), Some(m) if m.start() == 0),
+            StrPattern::Str(prefix) => text.starts_with(prefix.as_str()),
+        }
+    }
+
+    /// Check whether this string ends with the pattern.
+    ///
+    /// For a regular expression, the last match yielded by `find_iter` must
+    /// end exactly at the end of the string.
+    pub fn ends_with(&self, pattern: StrPattern) -> bool {
+        let text: &str = self;
+        match pattern {
+            StrPattern::Regex(re) => match re.find_iter(text).last() {
+                Some(m) => m.end() == text.len(),
+                None => false,
+            },
+            StrPattern::Str(suffix) => text.ends_with(suffix.as_str()),
+        }
+    }
+
+    /// Return the text of the first match of the pattern, if there is one.
+    ///
+    /// For a string pattern the match text is the pattern itself.
+    pub fn find(&self, pattern: StrPattern) -> Option<Self> {
+        match pattern {
+            StrPattern::Str(pat) => {
+                if self.0.contains(pat.as_str()) {
+                    Some(pat)
+                } else {
+                    None
+                }
+            }
+            StrPattern::Regex(re) => {
+                let found = re.find(self)?;
+                Some(found.as_str().into())
+            }
+        }
+    }
+
+    /// Return the byte offset at which the first match of the pattern
+    /// starts, if there is one.
+    pub fn position(&self, pattern: StrPattern) -> Option<i64> {
+        let offset = match pattern {
+            StrPattern::Str(pat) => self.0.find(pat.as_str()),
+            StrPattern::Regex(re) => re.find(self).map(|m| m.start()),
+        };
+        offset.map(|byte| byte as i64)
+    }
+
+    /// The start, end, text and capture groups (if any) of the first match
+    /// of the pattern in this string, or `None` if nothing matches.
+    pub fn match_(&self, pattern: StrPattern) -> Option<Dict> {
+        match pattern {
+            StrPattern::Str(pat) => {
+                let hit = self.0.match_indices(pat.as_str()).next()?;
+                Some(match_to_dict(hit))
+            }
+            StrPattern::Regex(re) => {
+                let caps = re.captures(self)?;
+                Some(captures_to_dict(caps))
+            }
+        }
+    }
+
+    /// Describe every match of the pattern in this string.
+    ///
+    /// Each array element is a dictionary with the start, end, text and
+    /// capture groups (if any) of one match.
+    pub fn matches(&self, pattern: StrPattern) -> Array {
+        match pattern {
+            StrPattern::Regex(re) => re
+                .captures_iter(self)
+                .map(|caps| Value::Dict(captures_to_dict(caps)))
+                .collect(),
+            StrPattern::Str(pat) => self
+                .0
+                .match_indices(pat.as_str())
+                .map(|hit| Value::Dict(match_to_dict(hit)))
+                .collect(),
+        }
+    }
+
+    /// Split this string into an array of pieces.
+    ///
+    /// Without a pattern the string is split at whitespace; otherwise it is
+    /// split at every occurrence of the string or regular expression.
+    pub fn split(&self, pattern: Option<StrPattern>) -> Array {
+        // Wrap a borrowed piece into an owned string value.
+        fn piece(text: &str) -> Value {
+            Value::Str(text.into())
+        }
+
+        let haystack = self.as_str();
+        match pattern {
+            Some(StrPattern::Regex(re)) => re.split(haystack).map(piece).collect(),
+            Some(StrPattern::Str(sep)) => {
+                haystack.split(sep.as_str()).map(piece).collect()
+            }
+            None => haystack.split_whitespace().map(piece).collect(),
+        }
+    }
+
+    /// Trim either whitespace or the given pattern at both or just one side of
+    /// the string.
+    ///
+    /// - `pattern`: what to strip; `None` strips whitespace.
+    /// - `at`: the side to trim; `None` trims both sides.
+    /// - `repeat`: whether the pattern is trimmed repeatedly instead of just
+    ///   once. It must only be given in combination with a pattern and is
+    ///   ignored otherwise.
+    ///
+    /// Returns the trimmed text as a new string.
+    pub fn trim(
+        &self,
+        pattern: Option<StrPattern>,
+        at: Option<StrSide>,
+        repeat: bool,
+    ) -> Self {
+        let mut start = matches!(at, Some(StrSide::Start) | None);
+        let end = matches!(at, Some(StrSide::End) | None);
+
+        let trimmed = match pattern {
+            None => match at {
+                None => self.0.trim(),
+                Some(StrSide::Start) => self.0.trim_start(),
+                Some(StrSide::End) => self.0.trim_end(),
+            },
+            Some(StrPattern::Str(pat)) => {
+                let pat = pat.as_str();
+                let mut s = self.as_str();
+                if repeat {
+                    if start {
+                        s = s.trim_start_matches(pat);
+                    }
+                    if end {
+                        s = s.trim_end_matches(pat);
+                    }
+                } else {
+                    if start {
+                        s = s.strip_prefix(pat).unwrap_or(s);
+                    }
+                    if end {
+                        s = s.strip_suffix(pat).unwrap_or(s);
+                    }
+                }
+                s
+            }
+            Some(StrPattern::Regex(re)) => {
+                let s = self.as_str();
+
+                // End offset of the previous match, or `None` before the
+                // first one. Starting from `None` (rather than 0) keeps a
+                // first match at offset 0 from being mistaken for the
+                // continuation of a run, which would otherwise leave the end
+                // untrimmed when the trailing run of matches starts at 0.
+                let mut last = None;
+                let mut range = 0..s.len();
+
+                for m in re.find_iter(s) {
+                    // Does this match follow directly after the last one?
+                    let consecutive = last == Some(m.start());
+
+                    // As long as we're at the very beginning or in a
+                    // consecutive run of matches and still trimming at the
+                    // start, trim.
+                    start &= m.start() == 0 || consecutive;
+                    if start {
+                        range.start = m.end();
+                        start &= repeat;
+                    }
+
+                    // Reset end trim if we aren't consecutive anymore or
+                    // aren't repeating.
+                    if end && (!consecutive || !repeat) {
+                        range.end = m.start();
+                    }
+
+                    last = Some(m.end());
+                }
+
+                // If the last match does not reach the end of the string,
+                // there is nothing to trim at the end.
+                if last.map_or(false, |last| last < s.len()) {
+                    range.end = s.len();
+                }
+
+                // The start may have moved past the end when the same
+                // matches were trimmed from both sides.
+                &s[range.start..range.start.max(range.end)]
+            }
+        };
+
+        trimmed.into()
+    }
+
+    /// Replace at most `count` occurrences of the given pattern with a
+    /// replacement string (beginning from the start).
+    ///
+    /// - `pattern`: the string or regular expression to search for.
+    /// - `with`: the replacement text. For regex patterns it is passed to the
+    ///   `regex` crate as a plain `&str` replacement.
+    /// - `count`: the maximum number of replacements. `None` replaces every
+    ///   occurrence and `Some(0)` replaces nothing, for both pattern kinds.
+    pub fn replace(&self, pattern: StrPattern, with: Self, count: Option<usize>) -> Self {
+        match pattern {
+            StrPattern::Str(pat) => match count {
+                Some(n) => self.0.replacen(pat.as_str(), &with, n).into(),
+                None => self.0.replace(pat.as_str(), &with).into(),
+            },
+            StrPattern::Regex(re) => match count {
+                // `Regex::replacen` treats a limit of 0 as "replace all", so
+                // handle zero here to agree with `str::replacen`.
+                Some(0) => self.clone(),
+                Some(n) => re.replacen(self, n, with.as_str()).into(),
+                // `Regex::replace` would stop after the first match.
+                None => re.replace_all(self, with.as_str()).into(),
+            },
+        }
+    }
+
+    /// Concatenate `n` copies of the string.
+    ///
+    /// Fails when `n` is negative or when the resulting byte length would
+    /// overflow `usize`.
+    pub fn repeat(&self, n: i64) -> StrResult<Self> {
+        let times = match usize::try_from(n) {
+            Ok(times) if self.0.len().checked_mul(times).is_some() => times,
+            _ => return Err(format!("cannot repeat this string {} times", n).into()),
+        };
+
+        Ok(Self(self.0.repeat(times)))
+    }
+
+    /// Resolve a possibly negative index into a byte offset.
+    ///
+    /// Negative indices count backwards from the end of the string. Fails
+    /// when the resolved offset lies outside `0..=len` or does not fall on a
+    /// character boundary.
+    fn locate(&self, index: i64) -> StrResult<usize> {
+        let len = self.len();
+        let wrapped = if index < 0 { len.checked_add(index) } else { Some(index) };
+
+        let resolved = match wrapped.and_then(|v| usize::try_from(v).ok()) {
+            Some(v) if v <= self.0.len() => v,
+            _ => return Err(out_of_bounds(index, len)),
+        };
+
+        if self.0.is_char_boundary(resolved) {
+            Ok(resolved)
+        } else {
+            Err(not_a_char_boundary(index))
+        }
+    }
+}
+
+/// Build the error message for a string index that lies out of bounds, reporting the index and the length.
+#[cold]
+fn out_of_bounds(index: i64, len: i64) -> EcoString {
+ eco_format!("string index out of bounds (index: {}, len: {})", index, len)
+}
+
+/// Build the error message for a string index that does not fall on a character boundary.
+#[cold]
+fn not_a_char_boundary(index: i64) -> EcoString {
+ eco_format!("string index {} is not a character boundary", index)
+}
+
+/// The error message when the string is empty.
+#[cold]
+fn string_is_empty() -> EcoString {
+ "string is empty".into()
+}
+
+/// Turn one `(start, text)` item produced by std's `match_indices` into a
+/// match dictionary. Plain string matches have no capture groups, so the
+/// `captures` entry is always an empty array.
+fn match_to_dict((start, text): (usize, &str)) -> Dict {
+    let end = start + text.len();
+    dict! {
+        "start" => Value::Int(start as i64),
+        "end" => Value::Int(end as i64),
+        "text" => Value::Str(text.into()),
+        "captures" => Value::Array(Array::new()),
+    }
+}
+
+/// Turn a set of regex captures into a match dictionary.
+///
+/// Group 0 supplies `start`, `end` and `text`; the remaining groups become
+/// the `captures` array, with `none` for groups that did not participate.
+fn captures_to_dict(cap: regex::Captures) -> Dict {
+    let whole = cap.get(0).expect("missing first match");
+    let groups: Array = cap
+        .iter()
+        .skip(1)
+        .map(|group| match group {
+            None => Value::None,
+            Some(m) => m.as_str().into(),
+        })
+        .collect();
+
+    dict! {
+        "start" => Value::Int(whole.start() as i64),
+        "end" => Value::Int(whole.end() as i64),
+        "text" => Value::Str(whole.as_str().into()),
+        "captures" => Value::Array(groups),
+    }
+}
+
+/// Lets a `Str` be used wherever a `&str` is expected.
+impl Deref for Str {
+    type Target = str;
+
+    fn deref(&self) -> &str {
+        &*self.0
+    }
+}
+
+/// Displays the raw text, honoring the formatter's padding options.
+impl Display for Str {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        f.pad(self.as_str())
+    }
+}
+
+/// Formats the string as a double-quoted, escaped literal.
+impl Debug for Str {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        f.write_char('"')?;
+        self.chars().try_for_each(|c| match c {
+            // NUL is written in braced unicode form.
+            '\0' => f.write_str("\\u{0}"),
+            // Single quotes need no escaping inside a double-quoted literal.
+            '\'' => f.write_str("'"),
+            '"' => f.write_str(r#"\""#),
+            other => Display::fmt(&other.escape_debug(), f),
+        })?;
+        f.write_char('"')
+    }
+}
+
+/// Concatenates two strings with `+`.
+impl Add for Str {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        let mut joined = self;
+        joined += rhs;
+        joined
+    }
+}
+
+/// Appends another string in place with `+=`.
+impl AddAssign for Str {
+    fn add_assign(&mut self, rhs: Self) {
+        let tail: &str = &rhs;
+        self.0.push_str(tail);
+    }
+}
+
+/// Cheap conversion to a borrowed string slice.
+impl AsRef<str> for Str {
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+/// Allows borrowing a `Str` as a `str`, e.g. for lookups keyed by `&str`.
+impl Borrow<str> for Str {
+    fn borrow(&self) -> &str {
+        self.as_str()
+    }
+}
+
+/// Creates a one-character string.
+impl From<char> for Str {
+    fn from(c: char) -> Self {
+        Self(EcoString::from(c))
+    }
+}
+
+/// Creates a string from a borrowed string slice.
+impl From<&str> for Str {
+    fn from(s: &str) -> Self {
+        Self(EcoString::from(s))
+    }
+}
+
+/// Wraps an existing `EcoString` without converting it.
+impl From<EcoString> for Str {
+    fn from(inner: EcoString) -> Self {
+        Str(inner)
+    }
+}
+
+/// Creates a string from an owned std `String`.
+impl From<String> for Str {
+    fn from(s: String) -> Self {
+        Self(EcoString::from(s))
+    }
+}
+
+/// Creates a string from a borrowed-or-owned string.
+impl From<Cow<'_, str>> for Str {
+    fn from(s: Cow<str>) -> Self {
+        Self(EcoString::from(s))
+    }
+}
+
+/// Builds a string by collecting characters.
+impl FromIterator<char> for Str {
+    fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Self {
+        let inner: EcoString = iter.into_iter().collect();
+        Self(inner)
+    }
+}
+
+/// Unwraps the underlying `EcoString`.
+impl From<Str> for EcoString {
+    fn from(s: Str) -> Self {
+        let Str(inner) = s;
+        inner
+    }
+}
+
+/// Converts into an owned std `String`.
+impl From<Str> for String {
+    fn from(s: Str) -> Self {
+        let Str(inner) = s;
+        inner.into()
+    }
+}
+
+/// A regular expression, stored as a newtype around [`regex::Regex`].
+#[derive(Clone)]
+pub struct Regex(regex::Regex);
+
+impl Regex {
+    /// Compile a regular expression from its source text.
+    ///
+    /// On failure, the error of the `regex` crate is rendered into the
+    /// returned error message.
+    pub fn new(re: &str) -> StrResult<Self> {
+        match regex::Regex::new(re) {
+            Ok(compiled) => Ok(Self(compiled)),
+            Err(err) => Err(eco_format!("{err}")),
+        }
+    }
+}
+
+/// Exposes the methods of the wrapped [`regex::Regex`].
+impl Deref for Regex {
+    type Target = regex::Regex;
+
+    fn deref(&self) -> &Self::Target {
+        let Self(inner) = self;
+        inner
+    }
+}
+
+/// Formats the regex as `regex("...")` with the debug-escaped source pattern.
+impl Debug for Regex {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let source = self.0.as_str();
+        write!(f, "regex({:?})", source)
+    }
+}
+
+/// Two regexes are equal when their source patterns are textually equal.
+impl PartialEq for Regex {
+    fn eq(&self, other: &Self) -> bool {
+        let (lhs, rhs) = (self.0.as_str(), other.0.as_str());
+        lhs == rhs
+    }
+}
+
+/// Hashes the source pattern, matching the equality implementation.
+impl Hash for Regex {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let source = self.0.as_str();
+        Hash::hash(source, state);
+    }
+}
+
+/// A pattern which can be searched for in a string: either a plain string or a regular expression.
+#[derive(Debug, Clone)]
+pub enum StrPattern {
+ /// Just a string, searched for as literal text.
+ Str(Str),
+ /// A regular expression.
+ Regex(Regex),
+}
+
+castable! {
+ StrPattern,
+ text: Str => Self::Str(text),
+ regex: Regex => Self::Regex(regex),
+}
+
+/// A side of a string, used by `Str::trim` to select where to trim.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)]
+pub enum StrSide {
+ /// The logical start of the string, may be left or right depending on the
+ /// language.
+ Start,
+ /// The logical end of the string (the side opposite to `Start`).
+ End,
+}
+
+castable! {
+ StrSide,
+ align: GenAlign => match align {
+ GenAlign::Start => Self::Start,
+ GenAlign::End => Self::End,
+ _ => Err("expected either `start` or `end`")?,
+ },
+}