summary refs log tree commit diff
path: root/crates
diff options
context:
space:
mode:
author Laurenz <laurmaedje@gmail.com> 2023-10-28 23:35:13 +0200
committer Laurenz <laurmaedje@gmail.com> 2023-10-29 00:52:15 +0200
commit 29130a26f83f28ae37be1ff4f57877e765d27285 (patch)
tree 5abc020ae1be4fa2681ca1b977ee7ce398b3a9bb /crates
parent 4c75adbb047cba73b052c2fafa9155e2e4026610 (diff)
Linebreaking for links
Diffstat (limited to 'crates')
-rw-r--r-- crates/typst-library/src/text/linebreak.rs 92
-rw-r--r-- crates/typst-syntax/src/lexer.rs 70
-rw-r--r-- crates/typst-syntax/src/lib.rs 2
3 files changed, 126 insertions, 38 deletions
diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs
index a026df5d..8fd48df1 100644
--- a/crates/typst-library/src/text/linebreak.rs
+++ b/crates/typst-library/src/text/linebreak.rs
@@ -5,6 +5,7 @@ use icu_provider_blob::BlobDataProvider;
use icu_segmenter::LineSegmenter;
use once_cell::sync::Lazy;
use typst::doc::Lang;
+use typst::syntax::link_prefix;
use super::TextElem;
use crate::layout::Preparation;
@@ -82,25 +83,40 @@ pub(crate) fn breakpoints<'a>(
p: &'a Preparation<'a>,
mut f: impl FnMut(usize, Breakpoint),
) {
+ let text = p.bidi.text;
+ let hyphenate = p.hyphenate != Some(false);
let lb = LINEBREAK_DATA.as_borrowed();
let segmenter = match p.lang {
Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
_ => &SEGMENTER,
};
- let hyphenate = p.hyphenate != Some(false);
let mut last = 0;
+ let mut iter = segmenter.segment_str(text).peekable();
+
+ loop {
+ // Special case for links. UAX #14 doesn't handle them well.
+ let (head, tail) = text.split_at(last);
+ if head.ends_with("://") || tail.starts_with("www.") {
+ let (link, _) = link_prefix(tail);
+ let end = last + link.len();
+ linebreak_link(link, |i| f(last + i, Breakpoint::Normal));
+ while iter.peek().map_or(false, |&p| p <= end) {
+ iter.next();
+ }
+ }
+
+ // Get the next UAX #14 linebreak opportunity.
+ let Some(point) = iter.next() else { break };
- // Walk over all UAX #14 linebreak opportunities.
- for point in segmenter.segment_str(p.bidi.text) {
// Skip breakpoint if there is no char before it. icu4x generates one
// at offset 0, but we don't want it.
- let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+ let Some(c) = text[..point].chars().next_back() else { continue };
// Find out whether the last break was mandatory by checking against
// rules LB4 and LB5, special-casing the end of text according to LB3.
// See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
- let breakpoint = if point == p.bidi.text.len() {
+ let breakpoint = if point == text.len() {
Breakpoint::Mandatory
} else {
match lb.get(c) {
@@ -121,8 +137,7 @@ pub(crate) fn breakpoints<'a>(
}
// Extract a hyphenatable "word".
- let word =
- &p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
+ let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
if word.is_empty() {
break 'hyphenate;
}
@@ -166,6 +181,69 @@ pub(crate) fn breakpoints<'a>(
}
}
+/// Produce linebreak opportunities for a link.
+fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
+ #[derive(PartialEq)]
+ enum Class {
+ Alphabetic,
+ Digit,
+ Open,
+ Other,
+ }
+
+ impl Class {
+ fn of(c: char) -> Self {
+ if c.is_alphabetic() {
+ Class::Alphabetic
+ } else if c.is_numeric() {
+ Class::Digit
+ } else if matches!(c, '(' | '[') {
+ Class::Open
+ } else {
+ Class::Other
+ }
+ }
+ }
+
+ let mut offset = 0;
+ let mut emit = |end: usize| {
+ let piece = &link[offset..end];
+ if piece.len() < 16 {
+ // A piece shorter than 16 bytes gets a single opportunity at its end.
+ offset = end;
+ f(offset);
+ } else {
+ // If it gets very long (e.g. a hash in the URL), just allow a
+ // break at every char.
+ for c in piece.chars() {
+ offset += c.len_utf8();
+ f(offset);
+ }
+ }
+ };
+
+ let mut prev = Class::Other;
+ for (end, c) in link.char_indices() {
+ let class = Class::of(c);
+
+ // Emit opportunities when going from
+ // - other -> other
+ // - any class -> a different non-other class, e.g.
+ //   alphabetic -> numeric or numeric -> alphabetic
+ // Never directly after an opening delimiter.
+ if end > 0
+ && prev != Class::Open
+ && if class == Class::Other { prev == Class::Other } else { class != prev }
+ {
+ emit(end);
+ }
+
+ prev = class;
+ }
+
+ emit(link.len());
+}
+
/// Whether hyphenation is enabled at the given offset.
fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
p.hyphenate
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 18622154..a909dfa0 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -253,43 +253,16 @@ impl Lexer<'_> {
}
fn link(&mut self) -> SyntaxKind {
- let mut brackets = Vec::new();
-
- #[rustfmt::skip]
- self.s.eat_while(|c: char| {
- match c {
- | '0' ..= '9'
- | 'a' ..= 'z'
- | 'A' ..= 'Z'
- | '!' | '#' | '$' | '%' | '&' | '*' | '+'
- | ',' | '-' | '.' | '/' | ':' | ';' | '='
- | '?' | '@' | '_' | '~' | '\'' => true,
- '[' => {
- brackets.push(SyntaxKind::LeftBracket);
- true
- }
- '(' => {
- brackets.push(SyntaxKind::LeftParen);
- true
- }
- ']' => brackets.pop() == Some(SyntaxKind::LeftBracket),
- ')' => brackets.pop() == Some(SyntaxKind::LeftParen),
- _ => false,
- }
- });
+ let (link, balanced) = link_prefix(self.s.after());
+ self.s.jump(self.s.cursor() + link.len());
- if !brackets.is_empty() {
+ if !balanced {
return self.error(
"automatic links cannot contain unbalanced brackets, \
use the `link` function instead",
);
}
- // Don't include the trailing characters likely to be part of text.
- while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
- self.s.uneat();
- }
-
SyntaxKind::Link
}
@@ -662,6 +635,43 @@ pub fn is_newline(character: char) -> bool {
)
}
+/// Extracts a prefix of the text that is a link and also returns whether the
+/// parentheses and brackets in the link were balanced.
+pub fn link_prefix(text: &str) -> (&str, bool) {
+ let mut s = unscanny::Scanner::new(text);
+ let mut brackets = Vec::new();
+
+ #[rustfmt::skip]
+ s.eat_while(|c: char| {
+ match c {
+ | '0' ..= '9'
+ | 'a' ..= 'z'
+ | 'A' ..= 'Z'
+ | '!' | '#' | '$' | '%' | '&' | '*' | '+'
+ | ',' | '-' | '.' | '/' | ':' | ';' | '='
+ | '?' | '@' | '_' | '~' | '\'' => true,
+ '[' => {
+ brackets.push(b'[');
+ true
+ }
+ '(' => {
+ brackets.push(b'(');
+ true
+ }
+ ']' => brackets.pop() == Some(b'['),
+ ')' => brackets.pop() == Some(b'('),
+ _ => false,
+ }
+ });
+
+ // Don't include the trailing characters likely to be part of text.
+ while matches!(s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
+ s.uneat();
+ }
+
+ (s.before(), brackets.is_empty())
+}
+
/// Split text at newlines.
pub(super) fn split_newlines(text: &str) -> Vec<&str> {
let mut s = Scanner::new(text);
diff --git a/crates/typst-syntax/src/lib.rs b/crates/typst-syntax/src/lib.rs
index 4ee37096..5cf740e7 100644
--- a/crates/typst-syntax/src/lib.rs
+++ b/crates/typst-syntax/src/lib.rs
@@ -15,7 +15,7 @@ mod span;
pub use self::file::{FileId, PackageSpec, PackageVersion, VirtualPath};
pub use self::highlight::{highlight, highlight_html, Tag};
pub use self::kind::SyntaxKind;
-pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline};
+pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline, link_prefix};
pub use self::node::{LinkedChildren, LinkedNode, SyntaxError, SyntaxNode};
pub use self::parser::{parse, parse_code, parse_math};
pub use self::source::Source;