Linebreaking for links

author: Laurenz <laurmaedje@gmail.com> 2023-10-28 23:35:13 +0200
committer: Laurenz <laurmaedje@gmail.com> 2023-10-29 00:52:15 +0200
commit: 29130a26f83f28ae37be1ff4f57877e765d27285 (patch)
tree: 5abc020ae1be4fa2681ca1b977ee7ce398b3a9bb /crates
parent: 4c75adbb047cba73b052c2fafa9155e2e4026610 (diff)
3 files changed, 126 insertions, 38 deletions
diff --git a/crates/typst-library/src/text/linebreak.rs b/crates/typst-library/src/text/linebreak.rs
index a026df5d..8fd48df1 100644
--- a/crates/typst-library/src/text/linebreak.rs
+++ b/crates/typst-library/src/text/linebreak.rs
@@ -5,6 +5,7 @@ use icu_provider_blob::BlobDataProvider;
 use icu_segmenter::LineSegmenter;
 use once_cell::sync::Lazy;
 use typst::doc::Lang;
+use typst::syntax::link_prefix;
 
 use super::TextElem;
 use crate::layout::Preparation;
@@ -82,25 +83,40 @@ pub(crate) fn breakpoints<'a>(
     p: &'a Preparation<'a>,
     mut f: impl FnMut(usize, Breakpoint),
 ) {
+    let text = p.bidi.text;
+    let hyphenate = p.hyphenate != Some(false);
     let lb = LINEBREAK_DATA.as_borrowed();
     let segmenter = match p.lang {
         Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
         _ => &SEGMENTER,
     };
 
-    let hyphenate = p.hyphenate != Some(false);
     let mut last = 0;
+    let mut iter = segmenter.segment_str(text).peekable();
+
+    loop {
+        // Special case for links. UAX #14 doesn't handle them well.
+        let (head, tail) = text.split_at(last);
+        if head.ends_with("://") || tail.starts_with("www.") {
+            let (link, _) = link_prefix(tail);
+            let end = last + link.len();
+            linebreak_link(link, |i| f(last + i, Breakpoint::Normal));
+            while iter.peek().map_or(false, |&p| p <= end) {
+                iter.next();
+            }
+        }
+
+        // Get the next UAX #14 linebreak opportunity.
+        let Some(point) = iter.next() else { break };
 
-    // Walk over all UAX #14 linebreak opportunities.
-    for point in segmenter.segment_str(p.bidi.text) {
         // Skip breakpoint if there is no char before it. icu4x generates one
         // at offset 0, but we don't want it.
-        let Some(c) = p.bidi.text[..point].chars().next_back() else { continue };
+        let Some(c) = text[..point].chars().next_back() else { continue };
 
         // Find out whether the last break was mandatory by checking against
         // rules LB4 and LB5, special-casing the end of text according to LB3.
         // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
-        let breakpoint = if point == p.bidi.text.len() {
+        let breakpoint = if point == text.len() {
             Breakpoint::Mandatory
         } else {
             match lb.get(c) {
@@ -121,8 +137,7 @@ pub(crate) fn breakpoints<'a>(
             }
 
             // Extract a hyphenatable "word".
-            let word =
-                &p.bidi.text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
+            let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
             if word.is_empty() {
                 break 'hyphenate;
             }
@@ -166,6 +181,69 @@ pub(crate) fn breakpoints<'a>(
     }
 }
 
+/// Produce linebreak opportunities for a link.
+fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
+    #[derive(PartialEq)]
+    enum Class {
+        Alphabetic,
+        Digit,
+        Open,
+        Other,
+    }
+
+    impl Class {
+        fn of(c: char) -> Self {
+            if c.is_alphabetic() {
+                Class::Alphabetic
+            } else if c.is_numeric() {
+                Class::Digit
+            } else if matches!(c, '(' | '[') {
+                Class::Open
+            } else {
+                Class::Other
+            }
+        }
+    }
+
+    let mut offset = 0;
+    let mut emit = |end: usize| {
+        let piece = &link[offset..end];
+        if piece.len() < 16 {
+            // For bearably long segments, emit them as one.
+            offset = end;
+            f(offset);
+        } else {
+            // If it gets very long (e.g. a hash in the URL), just allow a
+            // break at every char.
+            for c in piece.chars() {
+                offset += c.len_utf8();
+                f(offset);
+            }
+        }
+    };
+
+    let mut prev = Class::Other;
+    for (end, c) in link.char_indices() {
+        let class = Class::of(c);
+
+        // Emit opportunities when going from
+        // - other -> other
+        // - any different class -> alphabetic, numeric, or opening delimiter
+        //   (e.g. alphabetic -> numeric, numeric -> alphabetic)
+        // Never directly after an opening delimiter.
+        if end > 0
+            && prev != Class::Open
+            && if class == Class::Other { prev == Class::Other } else { class != prev }
+        {
+            emit(end);
+        }
+
+        prev = class;
+    }
+
+    emit(link.len());
+}
+
 /// Whether hyphenation is enabled at the given offset.
 fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
     p.hyphenate
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 18622154..a909dfa0 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -253,43 +253,16 @@ impl Lexer<'_> {
     }
 
     fn link(&mut self) -> SyntaxKind {
-        let mut brackets = Vec::new();
-
-        #[rustfmt::skip]
-        self.s.eat_while(|c: char| {
-            match c {
-                | '0' ..= '9'
-                | 'a' ..= 'z'
-                | 'A' ..= 'Z'
-                | '!' | '#' | '$' | '%' | '&' | '*' | '+'
-                | ',' | '-' | '.' | '/' | ':' | ';' | '='
-                | '?' | '@' | '_' | '~' | '\'' => true,
-                '[' => {
-                    brackets.push(SyntaxKind::LeftBracket);
-                    true
-                }
-                '(' => {
-                    brackets.push(SyntaxKind::LeftParen);
-                    true
-                }
-                ']' => brackets.pop() == Some(SyntaxKind::LeftBracket),
-                ')' => brackets.pop() == Some(SyntaxKind::LeftParen),
-                _ => false,
-            }
-        });
+        let (link, balanced) = link_prefix(self.s.after());
+        self.s.jump(self.s.cursor() + link.len());
 
-        if !brackets.is_empty() {
+        if !balanced {
             return self.error(
                 "automatic links cannot contain unbalanced brackets, \
                  use the `link` function instead",
             );
         }
 
-        // Don't include the trailing characters likely to be part of text.
-        while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
-            self.s.uneat();
-        }
-
         SyntaxKind::Link
     }
 
@@ -662,6 +635,43 @@ pub fn is_newline(character: char) -> bool {
     )
 }
 
+/// Extracts a prefix of the text that is a link and also returns whether the
+/// parentheses and brackets in the link were balanced.
+pub fn link_prefix(text: &str) -> (&str, bool) {
+    let mut s = unscanny::Scanner::new(text);
+    let mut brackets = Vec::new();
+
+    #[rustfmt::skip]
+    s.eat_while(|c: char| {
+        match c {
+            | '0' ..= '9'
+            | 'a' ..= 'z'
+            | 'A' ..= 'Z'
+            | '!' | '#' | '$' | '%' | '&' | '*' | '+'
+            | ',' | '-' | '.' | '/' | ':' | ';' | '='
+            | '?' | '@' | '_' | '~' | '\'' => true,
+            '[' => {
+                brackets.push(b'[');
+                true
+            }
+            '(' => {
+                brackets.push(b'(');
+                true
+            }
+            ']' => brackets.pop() == Some(b'['),
+            ')' => brackets.pop() == Some(b'('),
+            _ => false,
+        }
+    });
+
+    // Don't include the trailing characters likely to be part of text.
+    while matches!(s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
+        s.uneat();
+    }
+
+    (s.before(), brackets.is_empty())
+}
+
 /// Split text at newlines.
 pub(super) fn split_newlines(text: &str) -> Vec<&str> {
     let mut s = Scanner::new(text);
diff --git a/crates/typst-syntax/src/lib.rs b/crates/typst-syntax/src/lib.rs
index 4ee37096..5cf740e7 100644
--- a/crates/typst-syntax/src/lib.rs
+++ b/crates/typst-syntax/src/lib.rs
@@ -15,7 +15,7 @@ mod span;
 pub use self::file::{FileId, PackageSpec, PackageVersion, VirtualPath};
 pub use self::highlight::{highlight, highlight_html, Tag};
 pub use self::kind::SyntaxKind;
-pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline};
+pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline, link_prefix};
 pub use self::node::{LinkedChildren, LinkedNode, SyntaxError, SyntaxNode};
 pub use self::parser::{parse, parse_code, parse_math};
 pub use self::source::Source;
author	Laurenz <laurmaedje@gmail.com>	2023-10-28 23:35:13 +0200
committer	Laurenz <laurmaedje@gmail.com>	2023-10-29 00:52:15 +0200
commit	29130a26f83f28ae37be1ff4f57877e765d27285 (patch)
tree	5abc020ae1be4fa2681ca1b977ee7ce398b3a9bb /crates
parent	4c75adbb047cba73b052c2fafa9155e2e4026610 (diff)