1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
|
use std::ops::{Add, Sub};
use std::sync::LazyLock;
use az::SaturatingAs;
use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed};
use icu_properties::LineBreak;
use icu_provider::AsDeserializingBufferProvider;
use icu_provider_adapters::fork::ForkByKeyProvider;
use icu_provider_blob::BlobDataProvider;
use icu_segmenter::LineSegmenter;
use typst_library::engine::Engine;
use typst_library::layout::{Abs, Em};
use typst_library::model::Linebreaks;
use typst_library::text::{is_default_ignorable, Lang, TextElem};
use typst_syntax::link_prefix;
use unicode_segmentation::UnicodeSegmentation;
use super::*;
/// The cost of a line or inline layout.
///
/// Lower is better. Costs of individual lines are summed up to obtain the
/// cost of a whole layout.
type Cost = f64;
// Cost parameters.
//
// We choose higher costs than the Knuth-Plass paper (which would be 50) because
// it hyphenates way too eagerly in Typst otherwise. Could be related to the
// ratios coming out differently since Typst doesn't have the concept of glue,
// so things work a bit differently.
//
// Both values are only defaults: `CostMetrics::compute` multiplies them with
// the configured hyphenation and runt cost factors.
const DEFAULT_HYPH_COST: Cost = 135.0;
const DEFAULT_RUNT_COST: Cost = 100.0;
// Other parameters.
//
// - `MIN_RATIO` is the smallest ratio a justified line may have in the exact
//   pass before it counts as overfull (see `CostMetrics::compute`).
// - `MIN_APPROX_RATIO` plays the same role for the approximate pass.
// - `BOUND_EPS` is the tolerance added to the upper bound when the bounded
//   pass compares costs against it.
const MIN_RATIO: f64 = -1.0;
const MIN_APPROX_RATIO: f64 = -0.5;
const BOUND_EPS: f64 = 1e-3;
/// Creates a data provider backed by the ICU blob bundled in `typst_assets`.
///
/// Panics if the bundled blob cannot be loaded.
fn blob() -> BlobDataProvider {
    let provider = BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU);
    provider.unwrap()
}
/// The line break segmenter used for everything but Chinese/Japanese text.
/// It is built lazily on first use.
static SEGMENTER: LazyLock<LineSegmenter> = LazyLock::new(|| {
    let provider = blob();
    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
});
/// The line break segmenter used for Chinese/Japanese text.
///
/// Its data comes from the dedicated CJ segmentation blob combined, through a
/// `ForkByKeyProvider`, with the general ICU blob.
static CJ_SEGMENTER: LazyLock<LineSegmenter> = LazyLock::new(|| {
    let provider = ForkByKeyProvider::new(
        BlobDataProvider::try_new_from_static_blob(typst_assets::icu::ICU_CJ_SEGMENT)
            .unwrap(),
        blob(),
    );
    LineSegmenter::try_new_lstm_with_buffer_provider(&provider).unwrap()
});
/// Maps each code point to its Unicode line break property. Loaded lazily from
/// the ICU blob on first use.
static LINEBREAK_DATA: LazyLock<CodePointMapData<LineBreak>> = LazyLock::new(|| {
    let provider = blob();
    icu_properties::maps::load_line_break(&provider.as_deserializing()).unwrap()
});
/// A position in the text at which a line may (or must) end.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Breakpoint {
    /// An ordinary opportunity (e.g. after a space).
    Normal,
    /// A forced break (after '\n' or at the end of the text).
    Mandatory,
    /// A hyphenation opportunity within a word, together with the number of
    /// chars of the word that come before and after it.
    Hyphen(u8, u8),
}

impl Breakpoint {
    /// Removes the trailing part of `line` that should not remain on a line
    /// ending at this breakpoint.
    ///
    /// Default ignorables are always stripped first. After that, a normal
    /// break strips trailing whitespace, a mandatory break strips trailing
    /// line-terminating characters, and a hyphen break strips nothing more.
    pub fn trim(self, line: &str) -> &str {
        let stripped = line.trim_end_matches(is_default_ignorable);
        match self {
            // Nothing else to remove in front of a hyphen.
            Self::Hyphen(..) => stripped,
            // Drop the whitespace that led to the opportunity.
            Self::Normal => stripped.trim_end_matches(char::is_whitespace),
            // Drop the characters that forced the break.
            Self::Mandatory => {
                let classes = LINEBREAK_DATA.as_borrowed();
                let is_line_terminator = |c: char| {
                    matches!(
                        classes.get(c),
                        LineBreak::MandatoryBreak
                            | LineBreak::CarriageReturn
                            | LineBreak::LineFeed
                            | LineBreak::NextLine
                    )
                };
                stripped.trim_end_matches(is_line_terminator)
            }
        }
    }

    /// Returns `true` for the `Hyphen` variant and `false` for all others.
    pub fn is_hyphen(self) -> bool {
        match self {
            Self::Hyphen(..) => true,
            Self::Normal | Self::Mandatory => false,
        }
    }
}
/// Splits the prepared text into lines of at most `width`, using the
/// algorithm selected by `p.config.linebreaks`.
pub fn linebreak<'a>(
    engine: &Engine,
    p: &'a Preparation<'a>,
    width: Abs,
) -> Vec<Line<'a>> {
    let optimize = match p.config.linebreaks {
        Linebreaks::Simple => false,
        Linebreaks::Optimized => true,
    };

    if optimize {
        linebreak_optimized(engine, p, width)
    } else {
        linebreak_simple(engine, p, width)
    }
}
/// Performs line breaking in simple first-fit style: lines are built
/// greedily, each one taking as much text as fits. The result can be quite
/// unbalanced, but the procedure is fast and simple.
#[typst_macros::time]
fn linebreak_simple<'a>(
    engine: &Engine,
    p: &'a Preparation<'a>,
    width: Abs,
) -> Vec<Line<'a>> {
    let mut lines = Vec::with_capacity(16);
    // Byte offset at which the line currently being built starts.
    let mut cursor = 0;
    // The most recent attempt that still fit, together with its end offset.
    let mut pending = None;

    breakpoints(p, |end, breakpoint| {
        // Lay out everything from the current line start up to this
        // breakpoint.
        let mut candidate = line(engine, p, cursor..end, breakpoint, lines.last());
        let mut overflows = !width.fits(candidate.width);

        // When the candidate is too wide, commit the last attempt that fit
        // and start over from where it ended. The rebuilt candidate cannot be
        // broken up any further.
        if overflows {
            if let Some((fitting, fitting_end)) = pending.take() {
                lines.push(fitting);
                cursor = fitting_end;
                candidate = line(engine, p, cursor..end, breakpoint, lines.last());
                overflows = !width.fits(candidate.width);
            }
        }

        // A candidate that still overflows will not get any shorter, and a
        // mandatory break (e.g. from "\n") ends the line regardless. In both
        // cases the line is finished. Otherwise, remember it and keep going.
        if overflows || breakpoint == Breakpoint::Mandatory {
            lines.push(candidate);
            cursor = end;
            pending = None;
        } else {
            pending = Some((candidate, end));
        }
    });

    // Flush an attempt that was never committed.
    lines.extend(pending.map(|(rest, _)| rest));
    lines
}
/// Performs line breaking in optimized Knuth-Plass style. Compared to the
/// simple first-fit style, more context is used to pick the breaks. A line
/// may, for instance, be cut short although there would still be a bit of
/// room, if that improves the fit of one of the following lines.
///
/// The algorithm revolves around the notion of "cost": A line with a very
/// tight or very loose fit costs more than one that is just right. Ending a
/// line with a hyphen costs extra, and ending two successive lines with
/// hyphens costs even more.
///
/// The layout with minimal total cost is found through dynamic programming:
/// For each possible breakpoint, the optimal layout _up to that point_ is
/// determined by walking over all possible start points of a line ending
/// there and choosing the one for which the cost of that line plus the cost
/// of the optimal layout up to its start (already stored in the dynamic
/// programming table) is minimal. The layout determined for the final
/// breakpoint at the end of the text is the result.
#[typst_macros::time]
fn linebreak_optimized<'a>(
    engine: &Engine,
    p: &'a Preparation<'a>,
    width: Abs,
) -> Vec<Line<'a>> {
    let metrics = &CostMetrics::compute(p);

    // First, run Knuth-Plass with approximate metrics and take the exact cost
    // of the layout it finds. That cost serves as an upper bound which lets
    // the proper pass prune its search space.
    let bound = linebreak_optimized_approximate(engine, p, width, metrics);

    // Then, run the exact optimization within that bound.
    linebreak_optimized_bounded(engine, p, width, metrics, bound)
}
/// Performs line breaking in optimized Knuth-Plass style, but with an upper
/// bound on the cost. This allows us to skip many parts of the search space.
///
/// `upper_bound` is the total cost that candidate layouts may not exceed (up
/// to `BOUND_EPS`). Passing `Cost::INFINITY` disables the pruning.
#[typst_macros::time]
fn linebreak_optimized_bounded<'a>(
engine: &Engine,
p: &'a Preparation<'a>,
width: Abs,
metrics: &CostMetrics,
upper_bound: Cost,
) -> Vec<Line<'a>> {
/// An entry in the dynamic programming table for inline layout optimization.
///
/// - `pred` is the table index of the entry for the preceding line.
/// - `total` is the cost of this line plus all lines before it.
/// - `line` is the laid out line itself.
/// - `end` is the text offset at which the line ends.
struct Entry<'a> {
pred: usize,
total: Cost,
line: Line<'a>,
end: usize,
}
// Dynamic programming table. The first entry stands for the start of the
// text: It has no cost, an empty line, and ends at offset zero.
let mut table = vec![Entry { pred: 0, total: 0.0, line: Line::empty(), end: 0 }];
// Index of the first table entry that is still considered as a line start.
let mut active = 0;
// Offset of the breakpoint visited just before the current one.
let mut prev_end = 0;
breakpoints(p, |end, breakpoint| {
// Find the optimal predecessor.
let mut best: Option<Entry> = None;
// A lower bound for the cost of all following line attempts.
let mut line_lower_bound = None;
for (pred_index, pred) in table.iter().enumerate().skip(active) {
let start = pred.end;
// The line starts at the previous breakpoint, so there is no
// opportunity to break within it.
let unbreakable = prev_end == start;
// If the minimum cost we've established for the line is already
// too much, skip this attempt.
if line_lower_bound
.is_some_and(|lower| pred.total + lower > upper_bound + BOUND_EPS)
{
continue;
}
// Build the line.
let attempt = line(engine, p, start..end, breakpoint, Some(&pred.line));
// Determine the cost of the line and its stretch ratio.
let (line_ratio, line_cost) = ratio_and_cost(
p,
metrics,
width,
&pred.line,
&attempt,
breakpoint,
unbreakable,
);
// If the line is overfull, we adjust the set of active candidate
// line starts. This is the case if
// - justification is on, but we'd need to shrink too much
// - justification is off and the line just doesn't fit
//
// If this is the earliest breakpoint in the active set
// (active == pred_index), remove it from the active set. If there
// is an earlier one (active < pred_index), then the logically
// shorter line was in fact longer (can happen with negative
// spacing) and we can't trim the active set just yet.
if line_ratio < metrics.min_ratio && active == pred_index {
active += 1;
}
// The total cost of this line and its chain of predecessors.
let total = pred.total + line_cost;
// If the line is already underfull (`line_ratio > 0`), any shorter
// slice of the line will be even more underfull. So it'll only get
// worse from here and further attempts would also have a cost
// exceeding `upper_bound`. There is one exception: When the line
// has negative spacing, we can't know for sure, so we don't assign
// the lower bound in that case.
if line_ratio > 0.0
&& line_lower_bound.is_none()
&& !attempt.has_negative_width_items()
{
line_lower_bound = Some(line_cost);
}
// If the cost already exceeds the upper bound, we don't need to
// integrate this result into the table.
if total > upper_bound + BOUND_EPS {
continue;
}
// If this attempt is better than what we had before, take it! On a
// tie, the later predecessor wins.
if best.as_ref().is_none_or(|best| best.total >= total) {
best = Some(Entry { pred: pred_index, total, line: attempt, end });
}
}
// If this is a mandatory break, all breakpoints before this one become
// inactive since no line can span over the mandatory break.
if breakpoint == Breakpoint::Mandatory {
active = table.len();
}
// Adds at most one entry. If every attempt was pruned, this breakpoint
// gets no entry at all.
table.extend(best);
prev_end = end;
});
// Retrace the best path.
let mut lines = Vec::with_capacity(16);
let mut idx = table.len() - 1;
// This should only happen if our bound was faulty. Which shouldn't happen!
// Debug builds panic, release builds retry without a bound.
if table[idx].end != p.text.len() {
#[cfg(debug_assertions)]
panic!("bounded inline layout is incomplete");
#[cfg(not(debug_assertions))]
return linebreak_optimized_bounded(engine, p, width, metrics, Cost::INFINITY);
}
// Walk the chain of predecessors back to the first entry. Truncating and
// popping moves each line out of the table.
while idx != 0 {
table.truncate(idx + 1);
let entry = table.pop().unwrap();
lines.push(entry.line);
idx = entry.pred;
}
// The lines were collected back to front.
lines.reverse();
lines
}
/// Runs the normal Knuth-Plass algorithm, but instead of building proper lines
/// (which is costly) to determine costs, it determines approximate costs using
/// cumulative arrays.
///
/// This results in a likely good inline layout, for which we then compute
/// the exact cost. This cost is an upper bound for proper optimized
/// linebreaking. We can use it to heavily prune the search space.
///
/// Returns `Cost::INFINITY` if the layout that was found contains an overfull
/// line, as its cost is not a usable bound then.
#[typst_macros::time]
fn linebreak_optimized_approximate(
engine: &Engine,
p: &Preparation,
width: Abs,
metrics: &CostMetrics,
) -> Cost {
// Determine the cumulative estimation metrics.
let estimates = Estimates::compute(p);
/// An entry in the dynamic programming table for inline layout optimization.
///
/// In contrast to the exact pass, no line is stored. Instead, we keep what
/// is needed to build the line later: where it ends, at which kind of
/// breakpoint, and whether it was unbreakable.
struct Entry {
pred: usize,
total: Cost,
end: usize,
unbreakable: bool,
breakpoint: Breakpoint,
}
// Dynamic programming table. The first entry stands for the start of the
// text.
let mut table = vec![Entry {
pred: 0,
total: 0.0,
end: 0,
unbreakable: false,
breakpoint: Breakpoint::Mandatory,
}];
// Index of the first table entry that is still considered as a line start.
let mut active = 0;
// Offset of the breakpoint visited just before the current one.
let mut prev_end = 0;
breakpoints(p, |end, breakpoint| {
// Find the optimal predecessor.
let mut best: Option<Entry> = None;
for (pred_index, pred) in table.iter().enumerate().skip(active) {
let start = pred.end;
// The line starts at the previous breakpoint, so there is no
// opportunity to break within it.
let unbreakable = prev_end == start;
// Whether the line is justified. This is not 100% accurate w.r.t.
// line()'s behaviour, but good enough.
let justify = p.config.justify && breakpoint != Breakpoint::Mandatory;
// We don't really know whether the line naturally ends with a dash
// here, so we can miss that case, but it's ok, since all of this is
// just an estimate.
let consecutive_dash = pred.breakpoint.is_hyphen() && breakpoint.is_hyphen();
// Estimate how much the line's spaces would need to be stretched to
// make it the desired width. We trim at the end to not take into
// account trailing spaces. This is, again, only an approximation of
// the real behaviour of `line`.
let trimmed_end = start + p.text[start..end].trim_end().len();
let line_ratio = raw_ratio(
p,
width,
estimates.widths.estimate(start..trimmed_end)
+ if breakpoint.is_hyphen() {
metrics.approx_hyphen_width
} else {
Abs::zero()
},
estimates.stretchability.estimate(start..trimmed_end),
estimates.shrinkability.estimate(start..trimmed_end),
estimates.justifiables.estimate(start..trimmed_end),
);
// Determine the line's cost.
let line_cost = raw_cost(
metrics,
breakpoint,
line_ratio,
justify,
unbreakable,
consecutive_dash,
true,
);
// Adjust the set of active breakpoints.
// See `linebreak_optimized_bounded` for details.
if line_ratio < metrics.min_ratio && active == pred_index {
active += 1;
}
// The total cost of this line and its chain of predecessors.
let total = pred.total + line_cost;
// If this attempt is better than what we had before, take it! On a
// tie, the later predecessor wins.
if best.as_ref().is_none_or(|best| best.total >= total) {
best = Some(Entry {
pred: pred_index,
total,
end,
unbreakable,
breakpoint,
});
}
}
// If this is a mandatory break, all breakpoints before this one become
// inactive.
if breakpoint == Breakpoint::Mandatory {
active = table.len();
}
table.extend(best);
prev_end = end;
});
// Retrace the best path. The indices come out back to front.
let mut indices = Vec::with_capacity(16);
let mut idx = table.len() - 1;
while idx != 0 {
indices.push(idx);
idx = table[idx].pred;
}
let mut pred = Line::empty();
let mut start = 0;
let mut exact = 0.0;
// The cost that we optimized was only an approximate cost, so the layout we
// got here is only likely to be good, not guaranteed to be the best. We now
// compute its exact cost as that gives us a sound upper bound for the
// proper optimization pass.
for idx in indices.into_iter().rev() {
let Entry { end, breakpoint, unbreakable, .. } = table[idx];
let attempt = line(engine, p, start..end, breakpoint, Some(&pred));
let (ratio, line_cost) =
ratio_and_cost(p, metrics, width, &pred, &attempt, breakpoint, unbreakable);
// If approximation produces a valid layout without too much shrinking,
// exact layout is guaranteed to find the same layout. If, however, the
// line is overfull, we do not have this guarantee. Then, our bound
// becomes useless and actively harmful (it could be lower than what
// optimal layout produces). Thus, we immediately bail with an infinite
// bound in this case.
if ratio < metrics.min_ratio {
return Cost::INFINITY;
}
pred = attempt;
start = end;
exact += line_cost;
}
exact
}
/// Computes the stretch ratio and the cost of the fully built line `attempt`.
///
/// `pred` is the line before `attempt`. It is only consulted to find out
/// whether both lines end with a dash. Returns `(ratio, cost)`.
#[allow(clippy::too_many_arguments)]
fn ratio_and_cost(
    p: &Preparation,
    metrics: &CostMetrics,
    available_width: Abs,
    pred: &Line,
    attempt: &Line,
    breakpoint: Breakpoint,
    unbreakable: bool,
) -> (f64, Cost) {
    // How far the line is from filling the available width.
    let stretch = raw_ratio(
        p,
        available_width,
        attempt.width,
        attempt.stretchability(),
        attempt.shrinkability(),
        attempt.justifiables(),
    );

    // Two lines in a row ending with a dash are penalized extra.
    let double_dash = pred.dash.is_some() && attempt.dash.is_some();

    // The line is exact, so the non-approximate minimum ratio applies.
    let cost = raw_cost(
        metrics,
        breakpoint,
        stretch,
        attempt.justify,
        unbreakable,
        double_dash,
        false,
    );

    (stretch, cost)
}
/// Determines the stretch ratio of a line from its raw metrics.
///
/// - A ratio < min_ratio indicates an overfull line.
/// - A negative ratio indicates a line that needs shrinking.
/// - A ratio of zero indicates a perfect line.
/// - A positive ratio indicates a line that needs stretching.
///
/// The result always lies within `MIN_RATIO - 1.0 ..= 10.0`.
fn raw_ratio(
    p: &Preparation,
    available_width: Abs,
    line_width: Abs,
    stretchability: Abs,
    shrinkability: Abs,
    justifiables: usize,
) -> f64 {
    // The space that is left over (positive) or missing (negative). Tiny
    // values are snapped to zero to hide floating point errors from the
    // subtraction.
    let gap = {
        let raw = available_width - line_width;
        if raw.approx_eq(Abs::zero()) {
            Abs::zero()
        } else {
            raw
        }
    };

    // The amount of adjustment that is natural in the needed direction.
    let natural = if gap >= Abs::zero() { stretchability } else { shrinkability };

    // The ratio inherits its sign from `gap`. The divisor is forced to be
    // non-negative as nothing else would make sense. A NaN most likely stems
    // from a zero `gap` (common with monospace fonts and CJK text), meaning
    // that the line fits perfectly already.
    let basic = match gap / natural.max(Abs::zero()) {
        nan if nan.is_nan() => 0.0,
        other => other,
    };

    let ratio = if basic > 1.0 {
        // The natural stretchability does not suffice, so the remainder is
        // distributed among the justifiables (at least one, to not divide by
        // zero) and normalized by half the em size.
        let slots = justifiables.max(1) as f64;
        let extra = (gap - natural) / slots;
        1.0 + extra / (p.config.font_size / 2.0)
    } else {
        basic
    };

    // The lower limit only needs to be somewhere below MIN_RATIO as overfull
    // lines have a hard-coded huge cost anyway. The upper limit is 10 since
    // it doesn't really matter whether a line is stretched 10x or 20x.
    ratio.clamp(MIN_RATIO - 1.0, 10.0)
}
/// Computes the cost of a line from its raw metrics.
///
/// The formula is mostly the one from the Knuth-Plass paper, with a few
/// adjustments. `approx` selects which minimum ratio decides whether the line
/// counts as overfull.
fn raw_cost(
    metrics: &CostMetrics,
    breakpoint: Breakpoint,
    ratio: f64,
    justify: bool,
    unbreakable: bool,
    consecutive_dash: bool,
    approx: bool,
) -> Cost {
    let mandatory = breakpoint == Breakpoint::Mandatory;
    let overfull = ratio < metrics.min_ratio(approx);

    // The stretch/shrink cost of the line.
    let badness = if overfull {
        // An overfull line always gets the maximum cost.
        1_000_000.0
    } else if justify || !mandatory || ratio < 0.0 {
        // The normal badness is 100|ratio|^3. As the ratio is limited to 10,
        // this stays clear of the maximum cost.
        100.0 * ratio.abs().powi(3)
    } else {
        // An unjustified line before a mandatory break that needs no
        // shrinking is free.
        0.0
    };

    let mut penalty: Cost = 0.0;

    // Runts: a lone word before a mandatory break / at the end.
    if mandatory && unbreakable {
        penalty += metrics.runt_cost;
    }

    // Hyphenation. Breaks less than LIMIT chars away from either edge of the
    // word cost 15% more for every char they fall short.
    if let Breakpoint::Hyphen(before, after) = breakpoint {
        const LIMIT: u8 = 5;
        let steps = LIMIT.saturating_sub(before) + LIMIT.saturating_sub(after);
        let factor = 1.0 + 0.15 * steps as f64;
        penalty += factor * metrics.hyph_cost;
    }

    // Two dashes in a row (not necessarily hyphens). Knuth-Plass adds this
    // separately after the squaring and with a higher cost, but no rationale
    // for that could be found.
    if consecutive_dash {
        penalty += metrics.hyph_cost;
    }

    // From the Knuth-Plass Paper: $ (1 + beta_j + pi_j)^2 $.
    //
    // Through the added one, fewer lines are preferred when everything else
    // is more or less equal.
    let base = 1.0 + badness + penalty;
    base.powi(2)
}
/// Calls `f` for all possible points in the text where lines can be broken.
///
/// Yields for each breakpoint the byte index into the text and the kind of
/// break: normal, mandatory (after `\n` or at the end of the text), or a
/// hyphenation within a word.
///
/// This is an internal instead of an external iterator because it makes the
/// code much simpler and the consumers of this function don't need the
/// composability and flexibility of external iteration anyway.
fn breakpoints(p: &Preparation, mut f: impl FnMut(usize, Breakpoint)) {
let text = p.text;
// Single breakpoint at the end for empty text.
if text.is_empty() {
f(0, Breakpoint::Mandatory);
return;
}
// Hyphenation is only ruled out here if it is disabled for the whole text.
// Finer-grained settings are checked per opportunity in `hyphenations`.
let hyphenate = p.config.hyphenate != Some(false);
let lb = LINEBREAK_DATA.as_borrowed();
let segmenter = match p.config.lang {
Some(Lang::CHINESE | Lang::JAPANESE) => &CJ_SEGMENTER,
_ => &SEGMENTER,
};
// The byte offset up to which the text has been processed.
let mut last = 0;
let mut iter = segmenter.segment_str(text).peekable();
loop {
// Special case for links. UAX #14 doesn't handle them well.
let (head, tail) = text.split_at(last);
if head.ends_with("://") || tail.starts_with("www.") {
let (link, _) = link_prefix(tail);
linebreak_link(link, |i| f(last + i, Breakpoint::Normal));
last += link.len();
// Drop the segmenter's opportunities that fall within the link.
while iter.peek().is_some_and(|&p| p < last) {
iter.next();
}
}
// Get the next UAX #14 linebreak opportunity.
let Some(point) = iter.next() else { break };
// Skip breakpoint if there is no char before it. icu4x generates one
// at offset 0, but we don't want it.
let Some(c) = text[..point].chars().next_back() else { continue };
// Find out whether the last break was mandatory by checking against
// rules LB4 and LB5, special-casing the end of text according to LB3.
// See also: https://docs.rs/icu_segmenter/latest/icu_segmenter/struct.LineSegmenter.html
let breakpoint = if point == text.len() {
Breakpoint::Mandatory
} else {
match lb.get(c) {
// Fix for: https://github.com/unicode-org/icu4x/issues/4146
LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ => continue,
LineBreak::MandatoryBreak
| LineBreak::CarriageReturn
| LineBreak::LineFeed
| LineBreak::NextLine => Breakpoint::Mandatory,
_ => Breakpoint::Normal,
}
};
// Hyphenate between the last and current breakpoint. Only segments that
// consist solely of alphabetic chars are considered. `last` is advanced
// segment by segment so that it is the offset of the current segment.
if hyphenate && last < point {
for segment in text[last..point].split_word_bounds() {
if !segment.is_empty() && segment.chars().all(char::is_alphabetic) {
hyphenations(p, &lb, last, segment, &mut f);
}
last += segment.len();
}
}
// Call `f` for the UAX #14 break opportunity.
f(point, breakpoint);
last = point;
}
}
/// Generates the hyphenation breakpoints within a single word.
///
/// `offset` is the byte offset of `word` in the text. `f` is called with the
/// text offset of each permitted opportunity and a `Breakpoint::Hyphen` that
/// carries the number of chars of the word before and after that point.
/// Nothing is generated if no hyphenation language is available at `offset`.
fn hyphenations(
    p: &Preparation,
    lb: &CodePointMapDataBorrowed<LineBreak>,
    mut offset: usize,
    word: &str,
    mut f: impl FnMut(usize, Breakpoint),
) {
    let Some(lang) = lang_at(p, offset) else { return };

    let total_chars = word.chars().count();
    let word_end = offset + word.len();
    // The number of chars of the word that lie before `offset`.
    let mut seen = 0;

    for syllable in hypher::hyphenate(word, lang) {
        offset += syllable.len();
        seen += syllable.chars().count();

        // An opportunity is only reported if
        // - it does not come after the final syllable,
        // - hyphenation is in fact enabled at this position, and
        // - the syllable does not end with a char that forbids a break.
        let permitted = offset != word_end
            && hyphenate_at(p, offset)
            && !syllable.chars().next_back().is_some_and(|c| {
                matches!(
                    lb.get(c),
                    LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ
                )
            });

        if permitted {
            // The codepoint counts on both sides, saturated to fit into a u8.
            let before = seen.saturating_as::<u8>();
            let after = (total_chars - seen).saturating_as::<u8>();
            f(offset, Breakpoint::Hyphen(before, after));
        }
    }
}
/// Produces linebreak opportunities for a link.
///
/// `f` is called with byte offsets into `link`, in increasing order.
fn linebreak_link(link: &str, mut f: impl FnMut(usize)) {
    /// The coarse category of a char within a link.
    #[derive(PartialEq)]
    enum Kind {
        Letter,
        Number,
        Opener,
        Misc,
    }

    /// Categorizes a single char.
    fn classify(ch: char) -> Kind {
        match ch {
            _ if ch.is_alphabetic() => Kind::Letter,
            _ if ch.is_numeric() => Kind::Number,
            '(' | '[' => Kind::Opener,
            _ => Kind::Misc,
        }
    }

    // The offset of the most recently emitted opportunity.
    let mut emitted = 0;
    let mut before = Kind::Misc;

    for (idx, ch) in link.char_indices() {
        let kind = classify(ch);

        // There is a boundary in front of this char if
        // - it is not the first char,
        // - the previous char is not an opening delimiter, and
        // - either both chars are of the miscellaneous kind, or the current
        //   one is not and the two kinds differ.
        let boundary = idx > 0
            && match (&before, &kind) {
                (Kind::Opener, _) => false,
                (Kind::Misc, Kind::Misc) => true,
                (_, Kind::Misc) => false,
                (a, b) => a != b,
            };

        if boundary {
            let piece = &link[emitted..idx];
            if piece.len() >= 16 {
                // A very long piece (e.g. a hash in the URL) may be broken
                // after each of its chars.
                for inner in piece.chars() {
                    emitted += inner.len_utf8();
                    f(emitted);
                }
            } else {
                // A bearably long piece is kept in one part.
                emitted = idx;
                f(emitted);
            }
        }

        before = kind;
    }
}
/// Whether hyphenation is enabled at the given text offset.
///
/// A setting in the paragraph configuration takes precedence. Without one,
/// the styles of the text item at `offset` decide, with the paragraph's
/// `justify` setting as the fallback. Items that are not text never
/// hyphenate.
fn hyphenate_at(p: &Preparation, offset: usize) -> bool {
    if let Some(configured) = p.config.hyphenate {
        return configured;
    }

    let (_, item) = p.get(offset);
    item.text().is_some_and(|text| {
        TextElem::hyphenate_in(text.styles).unwrap_or(p.config.justify)
    })
}
/// The hyphenation language at the given text offset.
///
/// The language from the paragraph configuration is preferred, with the
/// styles of the text item at `offset` as the fallback. Returns `None` if
/// there is no text item to consult, if the language code cannot be converted
/// into the byte array expected by `hypher`, or if `hypher` doesn't know it.
fn lang_at(p: &Preparation, offset: usize) -> Option<hypher::Lang> {
    let lang = match p.config.lang {
        Some(configured) => configured,
        None => {
            let (_, item) = p.get(offset);
            TextElem::lang_in(item.text()?.styles)
        }
    };

    let code = lang.as_str().as_bytes().try_into().ok()?;
    hypher::Lang::from_iso(code)
}
/// Metrics for cost computation, resolved once per paragraph.
struct CostMetrics {
    /// The ratio below which a line counts as overfull in exact layout.
    min_ratio: f64,
    /// The ratio below which a line counts as overfull in approximate layout.
    min_approx_ratio: f64,
    /// The width assumed for a hyphen when estimating line widths.
    approx_hyphen_width: Abs,
    /// The cost of ending a line with a hyphenation.
    hyph_cost: Cost,
    /// The cost of a runt.
    runt_cost: Cost,
}

impl CostMetrics {
    /// Resolves the metrics shared by all cost computations for `p`.
    fn compute(p: &Preparation) -> Self {
        let config = &p.config;

        // Only justified lines may go below their natural width.
        let (min_ratio, min_approx_ratio) = if config.justify {
            (MIN_RATIO, MIN_APPROX_RATIO)
        } else {
            (0.0, 0.0)
        };

        Self {
            min_ratio,
            min_approx_ratio,
            // A rough hyphen width that is good enough for estimates.
            approx_hyphen_width: Em::new(0.33).at(config.font_size),
            // The default costs, scaled by the configured factors.
            hyph_cost: DEFAULT_HYPH_COST * config.costs.hyphenation().get(),
            runt_cost: DEFAULT_RUNT_COST * config.costs.runt().get(),
        }
    }

    /// The minimum line ratio that is allowed for shrinking. Approximate
    /// layout gets its own, separately resolved limit, as it would otherwise
    /// produce an invalid layout fairly often, which would render the bound
    /// useless.
    fn min_ratio(&self, approx: bool) -> f64 {
        match approx {
            true => self.min_approx_ratio,
            false => self.min_ratio,
        }
    }
}
/// Estimated line metrics.
///
/// Gives a quick estimate of a metric for the line between two byte
/// positions, without building that line.
struct Estimates {
    widths: CumulativeVec<Abs>,
    stretchability: CumulativeVec<Abs>,
    shrinkability: CumulativeVec<Abs>,
    justifiables: CumulativeVec<usize>,
}

impl Estimates {
    /// Accumulates the estimation metrics for approximate Knuth-Plass layout
    /// from the items of `p`.
    fn compute(p: &Preparation) -> Self {
        let capacity = p.text.len();
        let mut width_sums = CumulativeVec::with_capacity(capacity);
        let mut stretch_sums = CumulativeVec::with_capacity(capacity);
        let mut shrink_sums = CumulativeVec::with_capacity(capacity);
        let mut justifiable_sums = CumulativeVec::with_capacity(capacity);

        for (range, item) in p.items.iter() {
            match item {
                // Text contributes glyph by glyph.
                Item::Text(shaped) => {
                    for glyph in shaped.glyphs.iter() {
                        let bytes = glyph.range.len();
                        let stretch_sides = glyph.stretchability();
                        let shrink_sides = glyph.shrinkability();
                        let stretch = stretch_sides.0 + stretch_sides.1;
                        let shrink = shrink_sides.0 + shrink_sides.1;

                        width_sums.push(bytes, glyph.x_advance.at(shaped.size));
                        stretch_sums.push(bytes, stretch.at(shaped.size));
                        shrink_sums.push(bytes, shrink.at(shaped.size));
                        justifiable_sums.push(bytes, glyph.is_justifiable() as usize);
                    }
                }
                // Everything else only has a width.
                _ => width_sums.push(range.len(), item.natural_width()),
            }

            // Bring all four arrays to the length given by the item's end.
            let covered = range.end;
            width_sums.adjust(covered);
            stretch_sums.adjust(covered);
            shrink_sums.adjust(covered);
            justifiable_sums.adjust(covered);
        }

        Self {
            widths: width_sums,
            stretchability: stretch_sums,
            shrinkability: shrink_sums,
            justifiables: justifiable_sums,
        }
    }
}
/// An array holding the running sum of a metric.
struct CumulativeVec<T> {
    /// The sum of all metrics pushed so far.
    total: T,
    /// The running sums. Starts out with a single default value. `get`
    /// reads position `index - 1` of this array.
    summed: Vec<T>,
}

impl<T> CumulativeVec<T>
where
    T: Default + Copy + Add<Output = T> + Sub<Output = T>,
{
    /// Creates an instance whose array has room for `capacity` values and
    /// holds one default value.
    fn with_capacity(capacity: usize) -> Self {
        let mut summed = Vec::with_capacity(capacity);
        summed.push(T::default());
        Self { total: T::default(), summed }
    }

    /// Brings the array to exactly `len` values, either by cutting it off or
    /// by filling it up with the current total.
    fn adjust(&mut self, len: usize) {
        let fill = self.total;
        self.summed.resize(len, fill);
    }

    /// Adds `metric` to the total and appends the new total once for each of
    /// the segment's `byte_len` bytes.
    fn push(&mut self, byte_len: usize, metric: T) {
        self.total = self.total + metric;
        let grown = self.summed.len() + byte_len;
        self.summed.resize(grown, self.total);
    }

    /// Estimates the metric for the line spanned by `range` as the difference
    /// between the values at its end and at its start.
    #[track_caller]
    fn estimate(&self, range: Range) -> T {
        let at_end = self.get(range.end);
        let at_start = self.get(range.start);
        at_end - at_start
    }

    /// Gets the value for the given byte position: The default for position
    /// zero and the stored value at `index - 1` otherwise. Panics if that
    /// lies beyond the array.
    #[track_caller]
    fn get(&self, index: usize) -> T {
        if index == 0 {
            T::default()
        } else {
            self.summed[index - 1]
        }
    }
}
|