1 files changed, 476 insertions, 0 deletions
diff --git a/src/font/book.rs b/src/font/book.rs
new file mode 100644
index 00000000..8f19faf7
--- /dev/null
+++ b/src/font/book.rs
@@ -0,0 +1,476 @@
+use std::cmp::Reverse;
+use std::collections::BTreeMap;
+
+use serde::{Deserialize, Serialize};
+use ttf_parser::{name_id, PlatformId, Tag};
+use unicode_segmentation::UnicodeSegmentation;
+
+use super::{Font, FontStretch, FontStyle, FontVariant, FontWeight};
+
+/// Metadata about a collection of fonts.
+#[derive(Default)]
+pub struct FontBook {
+    /// Maps from lowercased family names to font indices.
+    families: BTreeMap<String, Vec<usize>>,
+    /// Metadata about each font in the collection.
+    infos: Vec<FontInfo>,
+}
+
+impl FontBook {
+    /// Create a new, empty font book.
+    pub fn new() -> Self {
+        Self { families: BTreeMap::new(), infos: vec![] }
+    }
+
+    /// Create a font book for a collection of fonts.
+    pub fn from_fonts<'a>(fonts: impl IntoIterator<Item = &'a Font>) -> Self {
+        let mut book = Self::new();
+        for font in fonts {
+            book.push(font.info().clone());
+        }
+        book
+    }
+
+    /// Insert metadata into the font book.
+    pub fn push(&mut self, info: FontInfo) {
+        let index = self.infos.len();
+        let family = info.family.to_lowercase();
+        self.families.entry(family).or_default().push(index);
+        self.infos.push(info);
+    }
+
+    /// An ordered iterator over all font families this loader knows and details
+    /// about the faces that are part of them.
+    pub fn families(
+        &self,
+    ) -> impl Iterator<Item = (&str, impl Iterator<Item = &FontInfo>)> + '_ {
+        // Since the keys are lowercased, we instead use the family field of the
+        // first face's info.
+        self.families.values().map(|ids| {
+            let family = self.infos[ids[0]].family.as_str();
+            let infos = ids.iter().map(|&id| &self.infos[id]);
+            (family, infos)
+        })
+    }
+
+    /// Try to find and load a font from the given `family` that matches
+    /// the given `variant` as closely as possible.
+    ///
+    /// The `family` should be all lowercase.
+    pub fn select(&self, family: &str, variant: FontVariant) -> Option<usize> {
+        let ids = self.families.get(family)?;
+        self.find_best_variant(None, variant, ids.iter().copied())
+    }
+
+    /// Try to find and load a fallback font that
+    /// - is as close as possible to the font `like` (if any)
+    /// - is as close as possible to the given `variant`
+    /// - is suitable for shaping the given `text`
+    pub fn select_fallback(
+        &self,
+        like: Option<&FontInfo>,
+        variant: FontVariant,
+        text: &str,
+    ) -> Option<usize> {
+        // Find the fonts that contain the text's first char ...
+        let c = text.chars().next()?;
+        let ids = self
+            .infos
+            .iter()
+            .enumerate()
+            .filter(|(_, info)| info.coverage.contains(c as u32))
+            .map(|(index, _)| index);
+
+        // ... and find the best variant among them.
+        self.find_best_variant(like, variant, ids)
+    }
+
+    /// Find the font in the passed iterator that
+    /// - is closest to the font `like` (if any)
+    /// - is closest to the given `variant`
+    ///
+    /// To do that we compute a key for all variants and select the one with the
+    /// minimal key. This key prioritizes:
+    /// - If `like` is some other font:
+    ///   - Are both fonts (not) monospaced?
+    ///   - Do both fonts (not) have serifs?
+    ///   - How many words do the families share in their prefix? E.g. "Noto
+    ///     Sans" and "Noto Sans Arabic" share two words, whereas "IBM Plex
+    ///     Arabic" shares none with "Noto Sans", so prefer "Noto Sans Arabic"
+    ///     if `like` is "Noto Sans". In case there are two equally good
+    ///     matches, we prefer the shorter one because it is less special (e.g.
+    ///     if `like` is "Noto Sans Arabic", we prefer "Noto Sans" over "Noto
+    ///     Sans CJK HK".)
+    /// - The style (normal / italic / oblique). If we want italic or oblique
+    ///   but it doesn't exist, the other one of the two is still better than
+    ///   normal.
+    /// - The absolute distance to the target stretch.
+    /// - The absolute distance to the target weight.
+    fn find_best_variant(
+        &self,
+        like: Option<&FontInfo>,
+        variant: FontVariant,
+        ids: impl IntoIterator<Item = usize>,
+    ) -> Option<usize> {
+        let mut best = None;
+        let mut best_key = None;
+
+        for id in ids {
+            let current = &self.infos[id];
+            let key = (
+                like.map(|like| {
+                    (
+                        current.flags.contains(FontFlags::MONOSPACE)
+                            != like.flags.contains(FontFlags::MONOSPACE),
+                        current.flags.contains(FontFlags::SERIF)
+                            != like.flags.contains(FontFlags::SERIF),
+                        Reverse(shared_prefix_words(&current.family, &like.family)),
+                        current.family.len(),
+                    )
+                }),
+                current.variant.style.distance(variant.style),
+                current.variant.stretch.distance(variant.stretch),
+                current.variant.weight.distance(variant.weight),
+            );
+
+            if best_key.map_or(true, |b| key < b) {
+                best = Some(id);
+                best_key = Some(key);
+            }
+        }
+
+        best
+    }
+}
+
+/// Properties of a single font.
+#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
+pub struct FontInfo {
+    /// The typographic font family this font is part of.
+    pub family: String,
+    /// Properties that distinguish this font from other fonts in the same
+    /// family.
+    pub variant: FontVariant,
+    /// Properties of the font.
+    pub flags: FontFlags,
+    /// The unicode coverage of the font.
+    pub coverage: Coverage,
+}
+
+bitflags::bitflags! {
+    /// Bitflags describing characteristics of a font.
+    #[derive(Serialize, Deserialize)]
+    pub struct FontFlags: u32 {
+        /// All glyphs have the same width.
+        const MONOSPACE = 1 << 0;
+        /// Glyphs have short strokes at their stems.
+        const SERIF = 1 << 1;
+    }
+}
+
+impl FontInfo {
+    /// Compute metadata for all fonts in the given data.
+    pub fn from_data<'a>(data: &'a [u8]) -> impl Iterator<Item = FontInfo> + 'a {
+        let count = ttf_parser::fonts_in_collection(data).unwrap_or(1);
+        (0 .. count).filter_map(move |index| {
+            let ttf = ttf_parser::Face::from_slice(data, index).ok()?;
+            Self::from_ttf(&ttf)
+        })
+    }
+
+    /// Compute metadata for a single ttf-parser face.
+    pub fn from_ttf(ttf: &ttf_parser::Face) -> Option<Self> {
+        // We cannot use Name ID 16 "Typographic Family", because for some
+        // fonts it groups together more than just Style / Weight / Stretch
+        // variants (e.g. Display variants of Noto fonts) and then some
+        // variants become inaccessible from Typst. And even though the
+        // fsSelection bit WWS should help us decide whether that is the
+        // case, it's wrong for some fonts (e.g. for certain variants of "Noto
+        // Sans Display").
+        //
+        // So, instead we use Name ID 1 "Family" and trim many common
+        // suffixes for which know that they just describe styling (e.g.
+        // "ExtraBold").
+        //
+        // Also, for Noto fonts we use Name ID 4 "Full Name" instead,
+        // because Name ID 1 "Family" sometimes contains "Display" and
+        // sometimes doesn't for the Display variants and that mixes things
+        // up.
+        let family = {
+            let mut family = find_name(ttf, name_id::FAMILY)?;
+            if family.starts_with("Noto") {
+                family = find_name(ttf, name_id::FULL_NAME)?;
+            }
+            typographic_family(&family).to_string()
+        };
+
+        let variant = {
+            let mut full = find_name(ttf, name_id::FULL_NAME).unwrap_or_default();
+            full.make_ascii_lowercase();
+
+            // Some fonts miss the relevant bits for italic or oblique, so
+            // we also try to infer that from the full name.
+            let italic = ttf.is_italic() || full.contains("italic");
+            let oblique =
+                ttf.is_oblique() || full.contains("oblique") || full.contains("slanted");
+
+            let style = match (italic, oblique) {
+                (false, false) => FontStyle::Normal,
+                (true, _) => FontStyle::Italic,
+                (_, true) => FontStyle::Oblique,
+            };
+
+            let weight = FontWeight::from_number(ttf.weight().to_number());
+            let stretch = FontStretch::from_number(ttf.width().to_number());
+
+            FontVariant { style, weight, stretch }
+        };
+
+        // Determine the unicode coverage.
+        let mut codepoints = vec![];
+        for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) {
+            if subtable.is_unicode() {
+                subtable.codepoints(|c| codepoints.push(c));
+            }
+        }
+
+        let mut flags = FontFlags::empty();
+        flags.set(FontFlags::MONOSPACE, ttf.is_monospaced());
+
+        // Determine whether this is a serif or sans-serif font.
+        if let Some(panose) = ttf
+            .table_data(Tag::from_bytes(b"OS/2"))
+            .and_then(|os2| os2.get(32 .. 45))
+        {
+            if matches!(panose, [2, 2 ..= 10, ..]) {
+                flags.insert(FontFlags::SERIF);
+            }
+        }
+
+        Some(FontInfo {
+            family,
+            variant,
+            flags,
+            coverage: Coverage::from_vec(codepoints),
+        })
+    }
+}
+
+/// Try to find and decode the name with the given id.
+pub(super) fn find_name(ttf: &ttf_parser::Face, name_id: u16) -> Option<String> {
+    ttf.names().into_iter().find_map(|entry| {
+        if entry.name_id == name_id {
+            if let Some(string) = entry.to_string() {
+                return Some(string);
+            }
+
+            if entry.platform_id == PlatformId::Macintosh && entry.encoding_id == 0 {
+                return Some(decode_mac_roman(entry.name));
+            }
+        }
+
+        None
+    })
+}
+
+/// Decode mac roman encoded bytes into a string.
+fn decode_mac_roman(coded: &[u8]) -> String {
+    #[rustfmt::skip]
+    const TABLE: [char; 128] = [
+        'Ä', 'Å', 'Ç', 'É', 'Ñ', 'Ö', 'Ü', 'á', 'à', 'â', 'ä', 'ã', 'å', 'ç', 'é', 'è',
+        'ê', 'ë', 'í', 'ì', 'î', 'ï', 'ñ', 'ó', 'ò', 'ô', 'ö', 'õ', 'ú', 'ù', 'û', 'ü',
+        '†', '°', '¢', '£', '§', '•', '¶', 'ß', '®', '©', '™', '´', '¨', '≠', 'Æ', 'Ø',
+        '∞', '±', '≤', '≥', '¥', 'µ', '∂', '∑', '∏', 'π', '∫', 'ª', 'º', 'Ω', 'æ', 'ø',
+        '¿', '¡', '¬', '√', 'ƒ', '≈', '∆', '«', '»', '…', '\u{a0}', 'À', 'Ã', 'Õ', 'Œ', 'œ',
+        '–', '—', '“', '”', '‘', '’', '÷', '◊', 'ÿ', 'Ÿ', '⁄', '€', '‹', '›', 'ﬁ', 'ﬂ',
+        '‡', '·', '‚', '„', '‰', 'Â', 'Ê', 'Á', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'Ó', 'Ô',
+        '\u{f8ff}', 'Ò', 'Ú', 'Û', 'Ù', 'ı', 'ˆ', '˜', '¯', '˘', '˙', '˚', '¸', '˝', '˛', 'ˇ',
+    ];
+
+    fn char_from_mac_roman(code: u8) -> char {
+        if code < 128 {
+            code as char
+        } else {
+            TABLE[(code - 128) as usize]
+        }
+    }
+
+    coded.iter().copied().map(char_from_mac_roman).collect()
+}
+
+/// Trim style naming from a family name.
+fn typographic_family(mut family: &str) -> &str {
+    // Separators between names, modifiers and styles.
+    const SEPARATORS: [char; 3] = [' ', '-', '_'];
+
+    // Modifiers that can appear in combination with suffixes.
+    const MODIFIERS: &[&str] = &[
+        "extra", "ext", "ex", "x", "semi", "sem", "sm", "demi", "dem", "ultra",
+    ];
+
+    // Style suffixes.
+    #[rustfmt::skip]
+    const SUFFIXES: &[&str] = &[
+        "normal", "italic", "oblique", "slanted",
+        "thin", "th", "hairline", "light", "lt", "regular", "medium", "med",
+        "md", "bold", "bd", "demi", "extb", "black", "blk", "bk", "heavy",
+        "narrow", "condensed", "cond", "cn", "cd", "compressed", "expanded", "exp"
+    ];
+
+    // Trim spacing and weird leading dots in Apple fonts.
+    family = family.trim().trim_start_matches('.');
+
+    // Lowercase the string so that the suffixes match case-insensitively.
+    let lower = family.to_ascii_lowercase();
+    let mut len = usize::MAX;
+    let mut trimmed = lower.as_str();
+
+    // Trim style suffixes repeatedly.
+    while trimmed.len() < len {
+        len = trimmed.len();
+
+        // Find style suffix.
+        let mut t = match SUFFIXES.iter().find_map(|s| trimmed.strip_suffix(s)) {
+            Some(t) => t,
+            None => break,
+        };
+
+        // Strip optional separator.
+        if let Some(s) = t.strip_suffix(SEPARATORS) {
+            trimmed = s;
+            t = s;
+        }
+
+        // Also allow an extra modifier, but apply it only if it is separated it
+        // from the text before it (to prevent false positives).
+        if let Some(t) = MODIFIERS.iter().find_map(|s| t.strip_suffix(s)) {
+            if let Some(stripped) = t.strip_suffix(SEPARATORS) {
+                trimmed = stripped;
+            }
+        }
+    }
+
+    &family[.. len]
+}
+
+/// How many words the two strings share in their prefix.
+fn shared_prefix_words(left: &str, right: &str) -> usize {
+    left.unicode_words()
+        .zip(right.unicode_words())
+        .take_while(|(l, r)| l == r)
+        .count()
+}
+
+/// A compactly encoded set of codepoints.
+///
+/// The set is represented by alternating specifications of how many codepoints
+/// are not in the set and how many are in the set.
+///
+/// For example, for the set `{2, 3, 4, 9, 10, 11, 15, 18, 19}`, there are:
+/// - 2 codepoints not inside (0, 1)
+/// - 3 codepoints inside (2, 3, 4)
+/// - 4 codepoints not inside (5, 6, 7, 8)
+/// - 3 codepoints inside (9, 10, 11)
+/// - 3 codepoints not inside (12, 13, 14)
+/// - 1 codepoint inside (15)
+/// - 2 codepoints not inside (16, 17)
+/// - 2 codepoints inside (18, 19)
+///
+/// So the resulting encoding is `[2, 3, 4, 3, 3, 1, 2, 2]`.
+#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
+#[serde(transparent)]
+pub struct Coverage(Vec<u32>);
+
+impl Coverage {
+    /// Encode a vector of codepoints.
+    pub fn from_vec(mut codepoints: Vec<u32>) -> Self {
+        codepoints.sort();
+        codepoints.dedup();
+
+        let mut runs = Vec::new();
+        let mut next = 0;
+
+        for c in codepoints {
+            if let Some(run) = runs.last_mut().filter(|_| c == next) {
+                *run += 1;
+            } else {
+                runs.push(c - next);
+                runs.push(1);
+            }
+
+            next = c + 1;
+        }
+
+        Self(runs)
+    }
+
+    /// Whether the codepoint is covered.
+    pub fn contains(&self, c: u32) -> bool {
+        let mut inside = false;
+        let mut cursor = 0;
+
+        for &run in &self.0 {
+            if (cursor .. cursor + run).contains(&c) {
+                return inside;
+            }
+            cursor += run;
+            inside = !inside;
+        }
+
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_trim_styles() {
+        assert_eq!(typographic_family("Atma Light"), "Atma");
+        assert_eq!(typographic_family("eras bold"), "eras");
+        assert_eq!(typographic_family("footlight mt light"), "footlight mt");
+        assert_eq!(typographic_family("times new roman"), "times new roman");
+        assert_eq!(
+            typographic_family("noto sans mono cond sembd"),
+            "noto sans mono"
+        );
+        assert_eq!(typographic_family("noto serif SEMCOND sembd"), "noto serif");
+        assert_eq!(typographic_family("crimson text"), "crimson text");
+        assert_eq!(typographic_family("footlight light"), "footlight");
+        assert_eq!(typographic_family("Noto Sans"), "Noto Sans");
+        assert_eq!(typographic_family("Noto Sans Light"), "Noto Sans");
+        assert_eq!(
+            typographic_family("Noto Sans Semicondensed Heavy"),
+            "Noto Sans"
+        );
+        assert_eq!(typographic_family("Familx"), "Familx");
+        assert_eq!(typographic_family("Font Ultra"), "Font Ultra");
+        assert_eq!(typographic_family("Font Ultra Bold"), "Font");
+    }
+
+    #[test]
+    fn test_coverage() {
+        #[track_caller]
+        fn test(set: &[u32], runs: &[u32]) {
+            let coverage = Coverage::from_vec(set.to_vec());
+            assert_eq!(coverage.0, runs);
+
+            let max = 5 + set.iter().copied().max().unwrap_or_default();
+            for c in 0 .. max {
+                assert_eq!(set.contains(&c), coverage.contains(c));
+            }
+        }
+
+        test(&[], &[]);
+        test(&[0], &[0, 1]);
+        test(&[1], &[1, 1]);
+        test(&[0, 1], &[0, 2]);
+        test(&[0, 1, 3], &[0, 2, 1, 1]);
+        test(
+            // {2, 3, 4, 9, 10, 11, 15, 18, 19}
+            &[18, 19, 2, 4, 9, 11, 15, 3, 3, 10],
+            &[2, 3, 4, 3, 3, 1, 2, 2],
+        )
+    }
+}