1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
|
use icu_properties::{maps::CodePointMapData, LineBreak};
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_blob::BlobDataProvider;
use icu_segmenter::LineSegmenter;
use once_cell::sync::Lazy;
use typst::doc::Lang;
use typst::syntax::link_prefix;
use super::TextElem;
use crate::layout::Preparation;
/// The ICU4X data blob that is embedded into the binary at compile time.
///
/// The line segmenters and the line break property map in this module are
/// all constructed from this blob.
///
/// Generated by the following command:
///
/// ```sh
/// icu4x-datagen --locales full \
///               --format blob \
///               --keys-for-bin target/debug/typst \
///               --out crates/typst-library/assets/icudata.postcard \
///               --overwrite
/// ```
///
/// Install icu_datagen with `cargo install icu_datagen`.
static ICU_DATA: &[u8] = include_bytes!("../../assets/icudata.postcard");
/// Line segmenter data for Chinese and Japanese, embedded into the binary at
/// compile time.
///
/// Generated by the following command:
///
/// ```sh
/// icu4x-datagen --locales zh ja \
///               --format blob \
///               --keys segmenter/line@1 \
///               --out crates/typst-library/assets/cj_linebreak_data.postcard \
///               --overwrite
/// ```
///
/// The icu_datagen used for this must be patched with
/// https://github.com/peng1999/icu4x/commit/b9beb6cbf633d61fc3d7983e5baf7f4449fbfae5
static CJ_LINEBREAK_DATA: &[u8] =
include_bytes!("../../assets/cj_linebreak_data.postcard");
/// The general line break segmenter.
///
/// Built lazily on first use from the embedded `ICU_DATA` blob. Panics on
/// first access if the blob cannot be loaded or the segmenter cannot be
/// constructed from it.
static SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
    LineSegmenter::try_new_lstm_with_buffer_provider(
        &BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap(),
    )
    .unwrap()
});
/// The line break segmenter for Chinese/Japanese text.
///
/// Built lazily on first use. The CJ-specific blob is handed to the
/// `ForkByKeyProvider` first and the general blob second; presumably the
/// first provider wins for the keys it supports (see the
/// `ForkByKeyProvider` documentation). Panics on first access if either blob
/// cannot be loaded or the segmenter cannot be constructed.
static CJ_SEGMENTER: Lazy<LineSegmenter> = Lazy::new(|| {
    let general = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    let cj_specific =
        BlobDataProvider::try_new_from_static_blob(CJ_LINEBREAK_DATA).unwrap();
    LineSegmenter::try_new_lstm_with_buffer_provider(&ForkByKeyProvider::new(
        cj_specific,
        general,
    ))
    .unwrap()
});
/// The Unicode line break properties for each code point.
///
/// Loaded lazily on first use from the embedded `ICU_DATA` blob through a
/// deserializing adapter. Panics on first access if loading fails.
static LINEBREAK_DATA: Lazy<CodePointMapData<LineBreak>> = Lazy::new(|| {
    let blob = BlobDataProvider::try_new_from_static_blob(ICU_DATA).unwrap();
    icu_properties::maps::load_line_break(&blob.as_deserializing()).unwrap()
});
/// The kind of a position at which a line may (or must) end.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum Breakpoint {
    /// An ordinary, optional opportunity (for instance after a space).
    Normal,
    /// A break that has to be taken (after '\n' or at the end of the text).
    Mandatory,
    /// An optional opportunity inside a word, produced by hyphenation.
    Hyphen,
}
/// Calls `f` for all possible points in the text where lines can be broken.
///
/// For each breakpoint, `f` receives the byte index into the paragraph text
/// and the kind of break: [`Breakpoint::Mandatory`] (after a hard line break
/// or at the end of the text), [`Breakpoint::Hyphen`] (a word-internal
/// opportunity found by hyphenation) or [`Breakpoint::Normal`].
///
/// This is an internal instead of an external iterator because it makes the
/// code much simpler and the consumers of this function don't need the
/// composability and flexibility of external iteration anyway.
pub(crate) fn breakpoints<'a>(
    p: &'a Preparation<'a>,
    mut f: impl FnMut(usize, Breakpoint),
) {
    let text = p.bidi.text;
    // Hyphenation is only ruled out globally if it is explicitly disabled for
    // the whole paragraph; otherwise it is decided per offset further below.
    let hyphenate = p.hyphenate != Some(false);
    let lb = LINEBREAK_DATA.as_borrowed();
    let segmenter = match p.lang {
        Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
        _ => &SEGMENTER,
    };

    let mut last = 0;
    let mut iter = segmenter.segment_str(text).peekable();

    loop {
        // Special case for links. UAX #14 doesn't handle them well.
        let (head, tail) = text.split_at(last);
        if head.ends_with("://") || tail.starts_with("www.") {
            let (link, _) = link_prefix(tail);
            let end = last + link.len();
            linebreak_link(link, |i| f(last + i, Breakpoint::Normal));

            // Drop the segmenter's opportunities that fall strictly inside
            // the link. An opportunity located exactly at the link's end must
            // be kept: it is classified below and may well be the mandatory
            // break at the end of the text.
            while iter.peek().map_or(false, |&next| next < end) {
                iter.next();
            }
        }

        // Get the UAX #14 linebreak opportunities.
        let Some(point) = iter.next() else { break };

        // Skip breakpoint if there is no char before it. icu4x generates one
        // at offset 0, but we don't want it.
        let Some(c) = text[..point].chars().next_back() else { continue };

        // Find out whether the last break was mandatory by checking against
        // rules LB4 and LB5, special-casing the end of text according to LB3.
        // See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
        let breakpoint = if point == text.len() {
            Breakpoint::Mandatory
        } else {
            match lb.get(c) {
                // Fix for: https://github.com/unicode-org/icu4x/issues/4146
                LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
                LineBreak::MandatoryBreak
                | LineBreak::CarriageReturn
                | LineBreak::LineFeed
                | LineBreak::NextLine => Breakpoint::Mandatory,
                _ => Breakpoint::Normal,
            }
        };

        // Hyphenate between the last and current breakpoint.
        'hyphenate: {
            if !hyphenate {
                break 'hyphenate;
            }

            // Extract a hyphenatable "word" by cutting off everything
            // non-alphabetic (spaces, punctuation, ...) at the end.
            let word = text[last..point].trim_end_matches(|c: char| !c.is_alphabetic());
            if word.is_empty() {
                break 'hyphenate;
            }

            let end = last + word.len();
            let mut offset = last;

            // Determine the language to hyphenate this word in.
            let Some(lang) = lang_at(p, last) else { break 'hyphenate };

            for syllable in hypher::hyphenate(word, lang) {
                // Don't hyphenate after the final syllable.
                offset += syllable.len();
                if offset == end {
                    continue;
                }

                // Filter out hyphenation opportunities where hyphenation was
                // actually disabled.
                if !hyphenate_at(p, offset) {
                    continue;
                }

                // Filter out forbidden hyphenation opportunities.
                if matches!(
                    syllable.chars().next_back().map(|c| lb.get(c)),
                    Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ)
                ) {
                    continue;
                }

                // Call `f` for the word-internal hyphenation opportunity.
                f(offset, Breakpoint::Hyphen);
            }
        }

        // Call `f` for the UAX #14 break opportunity.
        f(point, breakpoint);

        last = point;
    }
}
/// Produce linebreak opportunities for a link.
///
/// `f` is called with byte offsets relative to the start of `link`, in
/// ascending order. The final call always reports `link.len()`.
fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
    /// Coarse character category used to detect segment boundaries.
    #[derive(PartialEq)]
    enum Class {
        Alphabetic,
        Digit,
        Open,
        Other,
    }

    impl Class {
        fn of(c: char) -> Self {
            match c {
                _ if c.is_alphabetic() => Self::Alphabetic,
                _ if c.is_numeric() => Self::Digit,
                '(' | '[' => Self::Open,
                _ => Self::Other,
            }
        }
    }

    // Byte offset up to which opportunities have been reported so far.
    let mut reported = 0;
    let mut flush = |upto: usize| {
        let segment = &link[reported..upto];
        if segment.len() >= 16 {
            // A very long run (e.g. a hash in the URL): allow a break after
            // every single char.
            for ch in segment.chars() {
                reported += ch.len_utf8();
                f(reported);
            }
        } else {
            // A bearably long run: report it as one piece.
            reported = upto;
            f(reported);
        }
    };

    let mut before = Class::Other;
    for (idx, ch) in link.char_indices() {
        let current = Class::of(ch);

        // A boundary lies between two chars when going from
        // - other -> other
        // - any non-other class -> a different non-other class
        //   (e.g. alphabetic -> numeric, numeric -> alphabetic)
        // but never directly after an opening delimiter and never between a
        // non-other char and a following other char.
        let boundary = match (&before, &current) {
            (Class::Open, _) => false,
            (Class::Other, Class::Other) => true,
            (_, Class::Other) => false,
            (lhs, rhs) => lhs != rhs,
        };

        if idx > 0 && boundary {
            flush(idx);
        }

        before = current;
    }

    flush(link.len());
}
/// Whether hyphenation is enabled at the given offset.
///
/// A paragraph-wide setting in `p.hyphenate` takes precedence. Without one,
/// the styles of the text item found at `offset` decide. If there is no text
/// item at that offset, hyphenation counts as disabled.
fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
    if let Some(global) = p.hyphenate {
        return global;
    }

    let Some(item) = p.find(offset) else { return false };
    let Some(shaped) = item.text() else { return false };
    TextElem::hyphenate_in(shaped.styles)
}
/// The text language at the given offset.
///
/// A paragraph-wide language in `p.lang` takes precedence; otherwise the
/// language is read from the styles of the text item found at `offset`.
///
/// Returns `None` if no text item exists at the offset, if the language code
/// does not fit the byte array `hypher::Lang::from_iso` expects, or if
/// `from_iso` itself returns `None` for the code.
fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
    let lang = match p.lang {
        Some(global) => global,
        None => TextElem::lang_in(p.find(offset)?.text()?.styles),
    };

    let iso = lang.as_str().as_bytes().try_into().ok()?;
    hypher::Lang::from_iso(iso)
}
|