summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2024-10-03 15:00:58 +0200
committerGitHub <noreply@github.com>2024-10-03 13:00:58 +0000
commit0343e038d373b9164679c4d7b2c5345c58c123b7 (patch)
tree301bafe4e0e37da576a837e466eebb47e1d5ff15
parentd86789c1f7057e1cf5cfb02487febc5e1c4cf8c9 (diff)
Fix default ignorables (#5099)
-rw-r--r--crates/typst-pdf/src/content.rs16
-rw-r--r--crates/typst/src/layout/inline/collect.rs3
-rw-r--r--crates/typst/src/layout/inline/linebreak.rs17
-rw-r--r--crates/typst/src/layout/inline/mod.rs2
-rw-r--r--crates/typst/src/layout/inline/shaping.rs19
-rw-r--r--crates/typst/src/text/item.rs44
-rw-r--r--crates/typst/src/text/mod.rs18
7 files changed, 62 insertions, 57 deletions
diff --git a/crates/typst-pdf/src/content.rs b/crates/typst-pdf/src/content.rs
index aa20e55a..4f2baafb 100644
--- a/crates/typst-pdf/src/content.rs
+++ b/crates/typst-pdf/src/content.rs
@@ -425,7 +425,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
bail!(
g.span.0,
"the text {} could not be displayed with any font",
- text.text[g.range()].repr()
+ TextItemView::full(text).glyph_text(g).repr(),
);
}
}
@@ -441,7 +441,7 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
|| tables.svg.is_some()
|| tables.colr.is_some();
if !has_color_glyphs {
- write_normal_text(ctx, pos, TextItemView::all_of(text))?;
+ write_normal_text(ctx, pos, TextItemView::full(text))?;
return Ok(());
}
@@ -449,9 +449,9 @@ fn write_text(ctx: &mut Builder, pos: Point, text: &TextItem) -> SourceResult<()
text.glyphs.iter().filter(|g| is_color_glyph(&text.font, g)).count();
if color_glyph_count == text.glyphs.len() {
- write_color_glyphs(ctx, pos, TextItemView::all_of(text))?;
+ write_color_glyphs(ctx, pos, TextItemView::full(text))?;
} else if color_glyph_count == 0 {
- write_normal_text(ctx, pos, TextItemView::all_of(text))?;
+ write_normal_text(ctx, pos, TextItemView::full(text))?;
} else {
// Otherwise we need to split it in smaller text runs
let mut offset = 0;
@@ -493,9 +493,7 @@ fn write_normal_text(
let glyph_set = ctx.resources.glyph_sets.entry(text.item.font.clone()).or_default();
for g in text.glyphs() {
- let t = text.text();
- let segment = &t[g.range()];
- glyph_set.entry(g.id).or_insert_with(|| segment.into());
+ glyph_set.entry(g.id).or_insert_with(|| text.glyph_text(&g));
}
let fill_transform = ctx.state.transforms(Size::zero(), pos);
@@ -640,9 +638,7 @@ fn write_color_glyphs(
ctx.content.show(Str(&[index]));
- glyph_set
- .entry(glyph.id)
- .or_insert_with(|| text.text()[glyph.range()].into());
+ glyph_set.entry(glyph.id).or_insert_with(|| text.glyph_text(&glyph));
}
ctx.content.end_text();
diff --git a/crates/typst/src/layout/inline/collect.rs b/crates/typst/src/layout/inline/collect.rs
index f1bb5869..af14b152 100644
--- a/crates/typst/src/layout/inline/collect.rs
+++ b/crates/typst/src/layout/inline/collect.rs
@@ -8,7 +8,8 @@ use crate::layout::{
};
use crate::syntax::Span;
use crate::text::{
- LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes, SpaceElem, TextElem,
+ is_default_ignorable, LinebreakElem, SmartQuoteElem, SmartQuoter, SmartQuotes,
+ SpaceElem, TextElem,
};
use crate::utils::Numeric;
diff --git a/crates/typst/src/layout/inline/linebreak.rs b/crates/typst/src/layout/inline/linebreak.rs
index b72507fd..f5048aae 100644
--- a/crates/typst/src/layout/inline/linebreak.rs
+++ b/crates/typst/src/layout/inline/linebreak.rs
@@ -2,7 +2,6 @@ use std::ops::{Add, Sub};
use az::SaturatingAs;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
-use icu_properties::sets::CodePointSetData;
use icu_properties::LineBreak;
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_adapters::fork::ForkByKeyProvider;
@@ -16,7 +15,7 @@ use crate::engine::Engine;
use crate::layout::{Abs, Em};
use crate::model::Linebreaks;
use crate::syntax::link_prefix;
-use crate::text::{Lang, TextElem};
+use crate::text::{is_default_ignorable, Lang, TextElem};
/// The cost of a line or paragraph layout.
type Cost = f64;
@@ -58,12 +57,6 @@ static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
icu_properties::maps::load_line_break(&blob().as_deserializing()).unwrap()
});
-/// The set of Unicode default ignorables.
-static DEFAULT_IGNORABLE_DATA: Lazy<CodePointSetData> = Lazy::new(|| {
- icu_properties::sets::load_default_ignorable_code_point(&blob().as_deserializing())
- .unwrap()
-});
-
/// A line break opportunity.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Breakpoint {
@@ -80,8 +73,7 @@ impl Breakpoint {
/// Trim a line before this breakpoint.
pub fn trim(self, line: &str) -> &str {
// Trim default ignorables.
- let ignorable = DEFAULT_IGNORABLE_DATA.as_borrowed();
- let line = line.trim_end_matches(|c| ignorable.contains(c));
+ let line = line.trim_end_matches(is_default_ignorable);
match self {
// Trim whitespace.
@@ -986,8 +978,3 @@ where
}
}
}
-
-/// Whether a codepoint is Unicode `Default_Ignorable`.
-pub fn is_default_ignorable(c: char) -> bool {
- DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c)
-}
diff --git a/crates/typst/src/layout/inline/mod.rs b/crates/typst/src/layout/inline/mod.rs
index 889a028d..275cb332 100644
--- a/crates/typst/src/layout/inline/mod.rs
+++ b/crates/typst/src/layout/inline/mod.rs
@@ -10,7 +10,7 @@ use comemo::{Track, Tracked, TrackedMut};
use self::collect::{collect, Item, Segment, SpanMapper};
use self::finalize::finalize;
use self::line::{commit, line, Line};
-use self::linebreak::{is_default_ignorable, linebreak, Breakpoint};
+use self::linebreak::{linebreak, Breakpoint};
use self::prepare::{prepare, Preparation};
use self::shaping::{
cjk_punct_style, is_of_cj_script, shape_range, ShapedGlyph, ShapedText,
diff --git a/crates/typst/src/layout/inline/shaping.rs b/crates/typst/src/layout/inline/shaping.rs
index 4344192c..9d83a9a0 100644
--- a/crates/typst/src/layout/inline/shaping.rs
+++ b/crates/typst/src/layout/inline/shaping.rs
@@ -5,7 +5,7 @@ use std::sync::Arc;
use az::SaturatingAs;
use ecow::EcoString;
-use rustybuzz::{ShapePlan, UnicodeBuffer};
+use rustybuzz::{BufferFlags, ShapePlan, UnicodeBuffer};
use ttf_parser::Tag;
use unicode_bidi::{BidiInfo, Level as BidiLevel};
use unicode_script::{Script, UnicodeScript};
@@ -15,8 +15,8 @@ use crate::engine::Engine;
use crate::foundations::{Smart, StyleChain};
use crate::layout::{Abs, Dir, Em, Frame, FrameItem, Point, Size};
use crate::text::{
- decorate, families, features, variant, Font, FontVariant, Glyph, Lang, Region,
- TextElem, TextItem,
+ decorate, families, features, is_default_ignorable, variant, Font, FontVariant,
+ Glyph, Lang, Region, TextElem, TextItem,
};
use crate::utils::SliceExt;
use crate::World;
@@ -725,8 +725,11 @@ fn shape_segment<'a>(
text: &str,
mut families: impl Iterator<Item = &'a str> + Clone,
) {
- // Fonts dont have newlines and tabs.
- if text.chars().all(|c| c == '\n' || c == '\t') {
+ // Don't try shaping newlines, tabs, or default ignorables.
+ if text
+ .chars()
+ .all(|c| c == '\n' || c == '\t' || is_default_ignorable(c))
+ {
return;
}
@@ -774,6 +777,12 @@ fn shape_segment<'a>(
});
buffer.guess_segment_properties();
+ // By default, Harfbuzz will create zero-width space glyphs for default
+ // ignorables. This is probably useful for GUI apps that want noticable
+ // effects on the cursor for those, but for us it's not useful and hurts
+ // text extraction.
+ buffer.set_flags(BufferFlags::REMOVE_DEFAULT_IGNORABLES);
+
// Prepare the shape plan. This plan depends on direction, script, language,
// and features, but is independent from the text and can thus be memoized.
let plan = create_shape_plan(
diff --git a/crates/typst/src/text/item.rs b/crates/typst/src/text/item.rs
index 5de30a8d..f6044b2f 100644
--- a/crates/typst/src/text/item.rs
+++ b/crates/typst/src/text/item.rs
@@ -5,7 +5,7 @@ use ecow::EcoString;
use crate::layout::{Abs, Em};
use crate::syntax::Span;
-use crate::text::{Font, Lang, Region};
+use crate::text::{is_default_ignorable, Font, Lang, Region};
use crate::visualize::{FixedStroke, Paint};
/// A run of shaped text.
@@ -78,7 +78,7 @@ pub struct TextItemView<'a> {
impl<'a> TextItemView<'a> {
/// Build a TextItemView for the whole contents of a TextItem.
- pub fn all_of(text: &'a TextItem) -> Self {
+ pub fn full(text: &'a TextItem) -> Self {
Self::from_glyph_range(text, 0..text.glyphs.len())
}
@@ -87,28 +87,30 @@ impl<'a> TextItemView<'a> {
TextItemView { item: text, glyph_range }
}
- /// Obtains a glyph in this slice, remapping the range that it represents in
- /// the original text so that it is relative to the start of the slice
- pub fn glyph_at(&self, index: usize) -> Glyph {
- let g = &self.item.glyphs[self.glyph_range.start + index];
- let base = self.text_range().start as u16;
- Glyph {
- range: g.range.start - base..g.range.end - base,
- ..*g
- }
- }
-
/// Returns an iterator over the glyphs of the slice.
///
/// The range of text that each glyph represents is remapped to be relative
/// to the start of the slice.
pub fn glyphs(&self) -> impl Iterator<Item = Glyph> + '_ {
- (0..self.glyph_range.len()).map(|index| self.glyph_at(index))
+ let first = self.item.glyphs[self.glyph_range.start].range();
+ let last = self.item.glyphs[self.glyph_range.end - 1].range();
+ let base = first.start.min(last.start) as u16;
+ (0..self.glyph_range.len()).map(move |index| {
+ let g = &self.item.glyphs[self.glyph_range.start + index];
+ Glyph {
+ range: g.range.start - base..g.range.end - base,
+ ..*g
+ }
+ })
}
- /// The plain text that this slice represents
- pub fn text(&self) -> &str {
- &self.item.text[self.text_range()]
+ /// The plain text for the given glyph. This is an approximation since
+ /// glyphs do not correspond 1-1 with codepoints.
+ pub fn glyph_text(&self, glyph: &Glyph) -> EcoString {
+ self.item.text[glyph.range()]
+ .chars()
+ .filter(|&c| !is_default_ignorable(c))
+ .collect()
}
/// The total width of this text slice
@@ -119,12 +121,4 @@ impl<'a> TextItemView<'a> {
.sum::<Em>()
.at(self.item.size)
}
-
- /// The range of text in the original TextItem that this slice corresponds
- /// to.
- fn text_range(&self) -> Range<usize> {
- let first = self.item.glyphs[self.glyph_range.start].range();
- let last = self.item.glyphs[self.glyph_range.end - 1].range();
- first.start.min(last.start)..first.end.max(last.end)
- }
}
diff --git a/crates/typst/src/text/mod.rs b/crates/typst/src/text/mod.rs
index fbd5b968..2e53b11a 100644
--- a/crates/typst/src/text/mod.rs
+++ b/crates/typst/src/text/mod.rs
@@ -31,6 +31,10 @@ pub use self::space::*;
use std::fmt::{self, Debug, Formatter};
use ecow::{eco_format, EcoString};
+use icu_properties::sets::CodePointSetData;
+use icu_provider::AsDeserializingBufferProvider;
+use icu_provider_blob::BlobDataProvider;
+use once_cell::sync::Lazy;
use rustybuzz::Feature;
use smallvec::SmallVec;
use ttf_parser::{Rect, Tag};
@@ -1310,6 +1314,20 @@ cast! {
},
}
+/// Whether a codepoint is Unicode `Default_Ignorable`.
+pub(crate) fn is_default_ignorable(c: char) -> bool {
+ /// The set of Unicode default ignorables.
+ static DEFAULT_IGNORABLE_DATA: Lazy<CodePointSetData> = Lazy::new(|| {
+ icu_properties::sets::load_default_ignorable_code_point(
+ &BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU)
+ .unwrap()
+ .as_deserializing(),
+ )
+ .unwrap()
+ });
+ DEFAULT_IGNORABLE_DATA.as_borrowed().contains(c)
+}
+
/// Pushes `text` wrapped in LRE/RLE + PDF to `out`.
pub(crate) fn isolate(text: Content, styles: StyleChain, out: &mut Vec<Content>) {
out.push(TextElem::packed(match TextElem::dir_in(styles) {