diff options
| author | Laurenz <laurmaedje@gmail.com> | 2024-10-27 19:04:55 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-10-27 18:04:55 +0000 |
| commit | be7cfc85d08c545abfac08098b7b33b4bd71f37e (patch) | |
| tree | f4137fa2aaa57babae1f7603a9b2ed7e688f43d8 /crates/typst-library/src/loading/csv.rs | |
| parent | b8034a343831e8609aec2ec81eb7eeda57aa5d81 (diff) | |
Split out four new crates (#5302)
Diffstat (limited to 'crates/typst-library/src/loading/csv.rs')
| -rw-r--r-- | crates/typst-library/src/loading/csv.rs | 191 |
1 files changed, 191 insertions, 0 deletions
diff --git a/crates/typst-library/src/loading/csv.rs b/crates/typst-library/src/loading/csv.rs new file mode 100644 index 00000000..6822505d --- /dev/null +++ b/crates/typst-library/src/loading/csv.rs @@ -0,0 +1,191 @@ +use ecow::{eco_format, EcoString}; +use typst_syntax::Spanned; + +use crate::diag::{bail, At, SourceResult}; +use crate::engine::Engine; +use crate::foundations::{cast, func, scope, Array, Dict, IntoValue, Type, Value}; +use crate::loading::Readable; +use crate::World; + +/// Reads structured data from a CSV file. +/// +/// The CSV file will be read and parsed into a 2-dimensional array of strings: +/// Each row in the CSV file will be represented as an array of strings, and all +/// rows will be collected into a single array. Header rows will not be +/// stripped. +/// +/// # Example +/// ```example +/// #let results = csv("example.csv") +/// +/// #table( +/// columns: 2, +/// [*Condition*], [*Result*], +/// ..results.flatten(), +/// ) +/// ``` +#[func(scope, title = "CSV")] +pub fn csv( + /// The engine. + engine: &mut Engine, + /// Path to a CSV file. + /// + /// For more details, see the [Paths section]($syntax/#paths). + path: Spanned<EcoString>, + /// The delimiter that separates columns in the CSV file. + /// Must be a single ASCII character. + #[named] + #[default] + delimiter: Delimiter, + /// How to represent the file's rows. + /// + /// - If set to `array`, each row is represented as a plain array of + /// strings. + /// - If set to `dictionary`, each row is represented as a dictionary + /// mapping from header keys to strings. This option only makes sense when + /// a header row is present in the CSV file. + #[named] + #[default(RowType::Array)] + row_type: RowType, +) -> SourceResult<Array> { + let Spanned { v: path, span } = path; + let id = span.resolve_path(&path).at(span)?; + let data = engine.world.file(id).at(span)?; + self::csv::decode(Spanned::new(Readable::Bytes(data), span), delimiter, row_type) +} + +#[scope] +impl csv { + /// Reads structured data from a CSV string/bytes. + #[func(title = "Decode CSV")] + pub fn decode( + /// CSV data. + data: Spanned<Readable>, + /// The delimiter that separates columns in the CSV file. + /// Must be a single ASCII character. + #[named] + #[default] + delimiter: Delimiter, + /// How to represent the file's rows. + /// + /// - If set to `array`, each row is represented as a plain array of + /// strings. + /// - If set to `dictionary`, each row is represented as a dictionary + /// mapping from header keys to strings. This option only makes sense + /// when a header row is present in the CSV file. + #[named] + #[default(RowType::Array)] + row_type: RowType, + ) -> SourceResult<Array> { + let Spanned { v: data, span } = data; + let has_headers = row_type == RowType::Dict; + + let mut builder = ::csv::ReaderBuilder::new(); + builder.has_headers(has_headers); + builder.delimiter(delimiter.0 as u8); + + // Counting lines from 1 by default. + let mut line_offset: usize = 1; + let mut reader = builder.from_reader(data.as_slice()); + let mut headers: Option<::csv::StringRecord> = None; + + if has_headers { + // Counting lines from 2 because we have a header. + line_offset += 1; + headers = Some( + reader + .headers() + .map_err(|err| format_csv_error(err, 1)) + .at(span)? + .clone(), + ); + } + + let mut array = Array::new(); + for (line, result) in reader.records().enumerate() { + // Original solution was to use line from error, but that is + // incorrect with `has_headers` set to `false`. See issue: + // https://github.com/BurntSushi/rust-csv/issues/184 + let line = line + line_offset; + let row = result.map_err(|err| format_csv_error(err, line)).at(span)?; + let item = if let Some(headers) = &headers { + let mut dict = Dict::new(); + for (field, value) in headers.iter().zip(&row) { + dict.insert(field.into(), value.into_value()); + } + dict.into_value() + } else { + let sub = row.into_iter().map(|field| field.into_value()).collect(); + Value::Array(sub) + }; + array.push(item); + } + + Ok(array) + } +} + +/// The delimiter to use when parsing CSV files. +pub struct Delimiter(char); + +impl Default for Delimiter { + fn default() -> Self { + Self(',') + } +} + +cast! { + Delimiter, + self => self.0.into_value(), + v: EcoString => { + let mut chars = v.chars(); + let first = chars.next().ok_or("delimiter must not be empty")?; + if chars.next().is_some() { + bail!("delimiter must be a single character"); + } + + if !first.is_ascii() { + bail!("delimiter must be an ASCII character"); + } + + Self(first) + }, +} + +/// The type of parsed rows. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum RowType { + Array, + Dict, +} + +cast! { + RowType, + self => match self { + Self::Array => Type::of::<Array>(), + Self::Dict => Type::of::<Dict>(), + }.into_value(), + ty: Type => { + if ty == Type::of::<Array>() { + Self::Array + } else if ty == Type::of::<Dict>() { + Self::Dict + } else { + bail!("expected `array` or `dictionary`"); + } + }, +} + +/// Format the user-facing CSV error message. +fn format_csv_error(err: ::csv::Error, line: usize) -> EcoString { + match err.kind() { + ::csv::ErrorKind::Utf8 { .. } => "file is not valid utf-8".into(), + ::csv::ErrorKind::UnequalLengths { expected_len, len, .. } => { + eco_format!( + "failed to parse CSV (found {len} instead of \ + {expected_len} fields in line {line})" + ) + } + _ => eco_format!("failed to parse CSV ({err})"), + } +} |
