diff options
Diffstat (limited to 'src/font/book.rs')
| -rw-r--r-- | src/font/book.rs | 476 |
1 files changed, 476 insertions, 0 deletions
diff --git a/src/font/book.rs b/src/font/book.rs new file mode 100644 index 00000000..8f19faf7 --- /dev/null +++ b/src/font/book.rs @@ -0,0 +1,476 @@ +use std::cmp::Reverse; +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; +use ttf_parser::{name_id, PlatformId, Tag}; +use unicode_segmentation::UnicodeSegmentation; + +use super::{Font, FontStretch, FontStyle, FontVariant, FontWeight}; + +/// Metadata about a collection of fonts. +#[derive(Default)] +pub struct FontBook { + /// Maps from lowercased family names to font indices. + families: BTreeMap<String, Vec<usize>>, + /// Metadata about each font in the collection. + infos: Vec<FontInfo>, +} + +impl FontBook { + /// Create a new, empty font book. + pub fn new() -> Self { + Self { families: BTreeMap::new(), infos: vec![] } + } + + /// Create a font book for a collection of fonts. + pub fn from_fonts<'a>(fonts: impl IntoIterator<Item = &'a Font>) -> Self { + let mut book = Self::new(); + for font in fonts { + book.push(font.info().clone()); + } + book + } + + /// Insert metadata into the font book. + pub fn push(&mut self, info: FontInfo) { + let index = self.infos.len(); + let family = info.family.to_lowercase(); + self.families.entry(family).or_default().push(index); + self.infos.push(info); + } + + /// An ordered iterator over all font families this loader knows and details + /// about the faces that are part of them. + pub fn families( + &self, + ) -> impl Iterator<Item = (&str, impl Iterator<Item = &FontInfo>)> + '_ { + // Since the keys are lowercased, we instead use the family field of the + // first face's info. + self.families.values().map(|ids| { + let family = self.infos[ids[0]].family.as_str(); + let infos = ids.iter().map(|&id| &self.infos[id]); + (family, infos) + }) + } + + /// Try to find and load a font from the given `family` that matches + /// the given `variant` as closely as possible. + /// + /// The `family` should be all lowercase. + pub fn select(&self, family: &str, variant: FontVariant) -> Option<usize> { + let ids = self.families.get(family)?; + self.find_best_variant(None, variant, ids.iter().copied()) + } + + /// Try to find and load a fallback font that + /// - is as close as possible to the font `like` (if any) + /// - is as close as possible to the given `variant` + /// - is suitable for shaping the given `text` + pub fn select_fallback( + &self, + like: Option<&FontInfo>, + variant: FontVariant, + text: &str, + ) -> Option<usize> { + // Find the fonts that contain the text's first char ... + let c = text.chars().next()?; + let ids = self + .infos + .iter() + .enumerate() + .filter(|(_, info)| info.coverage.contains(c as u32)) + .map(|(index, _)| index); + + // ... and find the best variant among them. + self.find_best_variant(like, variant, ids) + } + + /// Find the font in the passed iterator that + /// - is closest to the font `like` (if any) + /// - is closest to the given `variant` + /// + /// To do that we compute a key for all variants and select the one with the + /// minimal key. This key prioritizes: + /// - If `like` is some other font: + /// - Are both fonts (not) monospaced? + /// - Do both fonts (not) have serifs? + /// - How many words do the families share in their prefix? E.g. "Noto + /// Sans" and "Noto Sans Arabic" share two words, whereas "IBM Plex + /// Arabic" shares none with "Noto Sans", so prefer "Noto Sans Arabic" + /// if `like` is "Noto Sans". In case there are two equally good + /// matches, we prefer the shorter one because it is less special (e.g. + /// if `like` is "Noto Sans Arabic", we prefer "Noto Sans" over "Noto + /// Sans CJK HK".) + /// - The style (normal / italic / oblique). If we want italic or oblique + /// but it doesn't exist, the other one of the two is still better than + /// normal. + /// - The absolute distance to the target stretch. + /// - The absolute distance to the target weight. + fn find_best_variant( + &self, + like: Option<&FontInfo>, + variant: FontVariant, + ids: impl IntoIterator<Item = usize>, + ) -> Option<usize> { + let mut best = None; + let mut best_key = None; + + for id in ids { + let current = &self.infos[id]; + let key = ( + like.map(|like| { + ( + current.flags.contains(FontFlags::MONOSPACE) + != like.flags.contains(FontFlags::MONOSPACE), + current.flags.contains(FontFlags::SERIF) + != like.flags.contains(FontFlags::SERIF), + Reverse(shared_prefix_words(¤t.family, &like.family)), + current.family.len(), + ) + }), + current.variant.style.distance(variant.style), + current.variant.stretch.distance(variant.stretch), + current.variant.weight.distance(variant.weight), + ); + + if best_key.map_or(true, |b| key < b) { + best = Some(id); + best_key = Some(key); + } + } + + best + } +} + +/// Properties of a single font. +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct FontInfo { + /// The typographic font family this font is part of. + pub family: String, + /// Properties that distinguish this font from other fonts in the same + /// family. + pub variant: FontVariant, + /// Properties of the font. + pub flags: FontFlags, + /// The unicode coverage of the font. + pub coverage: Coverage, +} + +bitflags::bitflags! { + /// Bitflags describing characteristics of a font. + #[derive(Serialize, Deserialize)] + pub struct FontFlags: u32 { + /// All glyphs have the same width. + const MONOSPACE = 1 << 0; + /// Glyphs have short strokes at their stems. + const SERIF = 1 << 1; + } +} + +impl FontInfo { + /// Compute metadata for all fonts in the given data. + pub fn from_data<'a>(data: &'a [u8]) -> impl Iterator<Item = FontInfo> + 'a { + let count = ttf_parser::fonts_in_collection(data).unwrap_or(1); + (0 .. count).filter_map(move |index| { + let ttf = ttf_parser::Face::from_slice(data, index).ok()?; + Self::from_ttf(&ttf) + }) + } + + /// Compute metadata for a single ttf-parser face. + pub fn from_ttf(ttf: &ttf_parser::Face) -> Option<Self> { + // We cannot use Name ID 16 "Typographic Family", because for some + // fonts it groups together more than just Style / Weight / Stretch + // variants (e.g. Display variants of Noto fonts) and then some + // variants become inaccessible from Typst. And even though the + // fsSelection bit WWS should help us decide whether that is the + // case, it's wrong for some fonts (e.g. for certain variants of "Noto + // Sans Display"). + // + // So, instead we use Name ID 1 "Family" and trim many common + // suffixes for which know that they just describe styling (e.g. + // "ExtraBold"). + // + // Also, for Noto fonts we use Name ID 4 "Full Name" instead, + // because Name ID 1 "Family" sometimes contains "Display" and + // sometimes doesn't for the Display variants and that mixes things + // up. + let family = { + let mut family = find_name(ttf, name_id::FAMILY)?; + if family.starts_with("Noto") { + family = find_name(ttf, name_id::FULL_NAME)?; + } + typographic_family(&family).to_string() + }; + + let variant = { + let mut full = find_name(ttf, name_id::FULL_NAME).unwrap_or_default(); + full.make_ascii_lowercase(); + + // Some fonts miss the relevant bits for italic or oblique, so + // we also try to infer that from the full name. + let italic = ttf.is_italic() || full.contains("italic"); + let oblique = + ttf.is_oblique() || full.contains("oblique") || full.contains("slanted"); + + let style = match (italic, oblique) { + (false, false) => FontStyle::Normal, + (true, _) => FontStyle::Italic, + (_, true) => FontStyle::Oblique, + }; + + let weight = FontWeight::from_number(ttf.weight().to_number()); + let stretch = FontStretch::from_number(ttf.width().to_number()); + + FontVariant { style, weight, stretch } + }; + + // Determine the unicode coverage. + let mut codepoints = vec![]; + for subtable in ttf.tables().cmap.into_iter().flat_map(|table| table.subtables) { + if subtable.is_unicode() { + subtable.codepoints(|c| codepoints.push(c)); + } + } + + let mut flags = FontFlags::empty(); + flags.set(FontFlags::MONOSPACE, ttf.is_monospaced()); + + // Determine whether this is a serif or sans-serif font. + if let Some(panose) = ttf + .table_data(Tag::from_bytes(b"OS/2")) + .and_then(|os2| os2.get(32 .. 45)) + { + if matches!(panose, [2, 2 ..= 10, ..]) { + flags.insert(FontFlags::SERIF); + } + } + + Some(FontInfo { + family, + variant, + flags, + coverage: Coverage::from_vec(codepoints), + }) + } +} + +/// Try to find and decode the name with the given id. +pub(super) fn find_name(ttf: &ttf_parser::Face, name_id: u16) -> Option<String> { + ttf.names().into_iter().find_map(|entry| { + if entry.name_id == name_id { + if let Some(string) = entry.to_string() { + return Some(string); + } + + if entry.platform_id == PlatformId::Macintosh && entry.encoding_id == 0 { + return Some(decode_mac_roman(entry.name)); + } + } + + None + }) +} + +/// Decode mac roman encoded bytes into a string. +fn decode_mac_roman(coded: &[u8]) -> String { + #[rustfmt::skip] + const TABLE: [char; 128] = [ + 'Ä', 'Å', 'Ç', 'É', 'Ñ', 'Ö', 'Ü', 'á', 'à', 'â', 'ä', 'ã', 'å', 'ç', 'é', 'è', + 'ê', 'ë', 'í', 'ì', 'î', 'ï', 'ñ', 'ó', 'ò', 'ô', 'ö', 'õ', 'ú', 'ù', 'û', 'ü', + '†', '°', '¢', '£', '§', '•', '¶', 'ß', '®', '©', '™', '´', '¨', '≠', 'Æ', 'Ø', + '∞', '±', '≤', '≥', '¥', 'µ', '∂', '∑', '∏', 'π', '∫', 'ª', 'º', 'Ω', 'æ', 'ø', + '¿', '¡', '¬', '√', 'ƒ', '≈', '∆', '«', '»', '…', '\u{a0}', 'À', 'Ã', 'Õ', 'Œ', 'œ', + '–', '—', '“', '”', '‘', '’', '÷', '◊', 'ÿ', 'Ÿ', '⁄', '€', '‹', '›', 'fi', 'fl', + '‡', '·', '‚', '„', '‰', 'Â', 'Ê', 'Á', 'Ë', 'È', 'Í', 'Î', 'Ï', 'Ì', 'Ó', 'Ô', + '\u{f8ff}', 'Ò', 'Ú', 'Û', 'Ù', 'ı', 'ˆ', '˜', '¯', '˘', '˙', '˚', '¸', '˝', '˛', 'ˇ', + ]; + + fn char_from_mac_roman(code: u8) -> char { + if code < 128 { + code as char + } else { + TABLE[(code - 128) as usize] + } + } + + coded.iter().copied().map(char_from_mac_roman).collect() +} + +/// Trim style naming from a family name. +fn typographic_family(mut family: &str) -> &str { + // Separators between names, modifiers and styles. + const SEPARATORS: [char; 3] = [' ', '-', '_']; + + // Modifiers that can appear in combination with suffixes. + const MODIFIERS: &[&str] = &[ + "extra", "ext", "ex", "x", "semi", "sem", "sm", "demi", "dem", "ultra", + ]; + + // Style suffixes. + #[rustfmt::skip] + const SUFFIXES: &[&str] = &[ + "normal", "italic", "oblique", "slanted", + "thin", "th", "hairline", "light", "lt", "regular", "medium", "med", + "md", "bold", "bd", "demi", "extb", "black", "blk", "bk", "heavy", + "narrow", "condensed", "cond", "cn", "cd", "compressed", "expanded", "exp" + ]; + + // Trim spacing and weird leading dots in Apple fonts. + family = family.trim().trim_start_matches('.'); + + // Lowercase the string so that the suffixes match case-insensitively. + let lower = family.to_ascii_lowercase(); + let mut len = usize::MAX; + let mut trimmed = lower.as_str(); + + // Trim style suffixes repeatedly. + while trimmed.len() < len { + len = trimmed.len(); + + // Find style suffix. + let mut t = match SUFFIXES.iter().find_map(|s| trimmed.strip_suffix(s)) { + Some(t) => t, + None => break, + }; + + // Strip optional separator. + if let Some(s) = t.strip_suffix(SEPARATORS) { + trimmed = s; + t = s; + } + + // Also allow an extra modifier, but apply it only if it is separated it + // from the text before it (to prevent false positives). + if let Some(t) = MODIFIERS.iter().find_map(|s| t.strip_suffix(s)) { + if let Some(stripped) = t.strip_suffix(SEPARATORS) { + trimmed = stripped; + } + } + } + + &family[.. len] +} + +/// How many words the two strings share in their prefix. +fn shared_prefix_words(left: &str, right: &str) -> usize { + left.unicode_words() + .zip(right.unicode_words()) + .take_while(|(l, r)| l == r) + .count() +} + +/// A compactly encoded set of codepoints. +/// +/// The set is represented by alternating specifications of how many codepoints +/// are not in the set and how many are in the set. +/// +/// For example, for the set `{2, 3, 4, 9, 10, 11, 15, 18, 19}`, there are: +/// - 2 codepoints not inside (0, 1) +/// - 3 codepoints inside (2, 3, 4) +/// - 4 codepoints not inside (5, 6, 7, 8) +/// - 3 codepoints inside (9, 10, 11) +/// - 3 codepoints not inside (12, 13, 14) +/// - 1 codepoint inside (15) +/// - 2 codepoints not inside (16, 17) +/// - 2 codepoints inside (18, 19) +/// +/// So the resulting encoding is `[2, 3, 4, 3, 3, 1, 2, 2]`. +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Coverage(Vec<u32>); + +impl Coverage { + /// Encode a vector of codepoints. + pub fn from_vec(mut codepoints: Vec<u32>) -> Self { + codepoints.sort(); + codepoints.dedup(); + + let mut runs = Vec::new(); + let mut next = 0; + + for c in codepoints { + if let Some(run) = runs.last_mut().filter(|_| c == next) { + *run += 1; + } else { + runs.push(c - next); + runs.push(1); + } + + next = c + 1; + } + + Self(runs) + } + + /// Whether the codepoint is covered. + pub fn contains(&self, c: u32) -> bool { + let mut inside = false; + let mut cursor = 0; + + for &run in &self.0 { + if (cursor .. cursor + run).contains(&c) { + return inside; + } + cursor += run; + inside = !inside; + } + + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_trim_styles() { + assert_eq!(typographic_family("Atma Light"), "Atma"); + assert_eq!(typographic_family("eras bold"), "eras"); + assert_eq!(typographic_family("footlight mt light"), "footlight mt"); + assert_eq!(typographic_family("times new roman"), "times new roman"); + assert_eq!( + typographic_family("noto sans mono cond sembd"), + "noto sans mono" + ); + assert_eq!(typographic_family("noto serif SEMCOND sembd"), "noto serif"); + assert_eq!(typographic_family("crimson text"), "crimson text"); + assert_eq!(typographic_family("footlight light"), "footlight"); + assert_eq!(typographic_family("Noto Sans"), "Noto Sans"); + assert_eq!(typographic_family("Noto Sans Light"), "Noto Sans"); + assert_eq!( + typographic_family("Noto Sans Semicondensed Heavy"), + "Noto Sans" + ); + assert_eq!(typographic_family("Familx"), "Familx"); + assert_eq!(typographic_family("Font Ultra"), "Font Ultra"); + assert_eq!(typographic_family("Font Ultra Bold"), "Font"); + } + + #[test] + fn test_coverage() { + #[track_caller] + fn test(set: &[u32], runs: &[u32]) { + let coverage = Coverage::from_vec(set.to_vec()); + assert_eq!(coverage.0, runs); + + let max = 5 + set.iter().copied().max().unwrap_or_default(); + for c in 0 .. max { + assert_eq!(set.contains(&c), coverage.contains(c)); + } + } + + test(&[], &[]); + test(&[0], &[0, 1]); + test(&[1], &[1, 1]); + test(&[0, 1], &[0, 2]); + test(&[0, 1, 3], &[0, 2, 1, 1]); + test( + // {2, 3, 4, 9, 10, 11, 15, 18, 19} + &[18, 19, 2, 4, 9, 11, 15, 3, 3, 10], + &[2, 3, 4, 3, 3, 1, 2, 2], + ) + } +} |
