"""Font subsetting utilities for reducing embedded font file sizes."""
import html
import io
import logging
import re
import xml.etree.ElementTree as ET
from fontTools import subset
from fontTools.ttLib import TTFont
logger = logging.getLogger(__name__)
[docs]
def subset_font(
input_path: str,
output_format: str,
unicode_codepoints: set[int],
) -> bytes:
"""Subset a font file to include only specified Unicode codepoints.
This function uses fontTools (pyftsubset) to create a minimal font file
containing only the glyphs needed for the specified codepoints.
Args:
input_path: Path to input font file (TTF/OTF).
output_format: Output format - "ttf", "otf", or "woff2".
unicode_codepoints: Set of Unicode codepoints (integers) to include in
the subset.
Returns:
Subset font file as bytes.
Raises:
ImportError: If fonttools package is not installed.
Exception: If subsetting fails (invalid font, I/O error, etc.).
Example:
>>> codepoints = {0x41, 0x42, 0x43, 0x3042} # A, B, C, あ
>>> font_bytes = subset_font("/usr/share/fonts/arial.ttf", "woff2", codepoints)
>>> len(font_bytes) # Much smaller than original
8432
"""
if output_format not in ("ttf", "otf", "woff2"):
raise ValueError(
f"Unsupported font format: {output_format}. "
f"Supported formats: ttf, otf, woff2"
)
if not unicode_codepoints:
logger.warning(
"No Unicode codepoints provided for subsetting, using all glyphs"
)
# Convert to sorted list for fontTools
unicodes = sorted(unicode_codepoints)
logger.debug(
f"Subsetting font: {input_path} -> {output_format} "
f"({len(unicode_codepoints)} codepoint(s))"
)
try:
# Load the font
font = TTFont(input_path)
# Create subsetter with options
subsetter = subset.Subsetter()
# Subset options
options = subset.Options()
options.drop_tables = [] # Keep all tables by default
options.layout_features = ["*"] # Preserve all OpenType features
options.name_IDs = ["*"] # Preserve all name table entries
options.name_languages = ["*"] # Preserve all languages
options.notdef_outline = True # Keep .notdef glyph
options.glyph_names = True # Preserve glyph names (helps debugging)
subsetter.options = options
# Populate subset with Unicode characters
if unicodes:
subsetter.populate(unicodes=unicodes)
else:
# If no characters specified, include all glyphs
subsetter.populate(glyphs=font.getGlyphOrder())
# Perform subsetting
subsetter.subset(font)
# Save to bytes (using a temporary in-memory approach)
output_buffer = io.BytesIO()
# Set flavor for WOFF2 on the font object before saving
if output_format == "woff2":
font.flavor = "woff2"
font.save(output_buffer)
font_bytes = output_buffer.getvalue()
logger.debug(
f"Subsetting complete: {len(font_bytes)} bytes "
f"(~{len(font_bytes) / 1024:.1f} KB)"
)
return font_bytes
except Exception as e:
logger.error(f"Font subsetting failed for {input_path}: {e}")
raise
def _get_local_tag_name(element: ET.Element) -> str:
"""Extract local tag name from element (handles namespaces).
Args:
element: XML element.
Returns:
Local tag name without namespace prefix.
Example:
>>> elem.tag = "{http://www.w3.org/2000/svg}text"
>>> _get_local_tag_name(elem)
"text"
"""
tag = element.tag
if "}" in tag:
return tag.split("}", 1)[1]
return tag
def _extract_font_family(element: ET.Element) -> str | None:
"""Extract font-family from element, returning LAST font in fallback chain.
For fallback chains like "Arial", "DejaVu Sans", returns "DejaVu Sans"
(the actually embedded font for subsetting).
Args:
element: XML element (text or tspan).
Returns:
Font family name (last in chain), or None if not found.
Example:
>>> _extract_font_family(
... <text style="font-family: 'Arial', 'DejaVu Sans'; ...">
... )
"DejaVu Sans"
>>> _extract_font_family(<text font-family="'Helvetica', 'Arial'">)
"Arial"
"""
# Try style attribute first
style = element.get("style", "")
if style:
match = re.search(r"font-family:\s*([^;]+)", style)
if match:
font_family_value = match.group(1).strip()
# Parse comma-separated list of fonts
families = [f.strip().strip("'\"") for f in font_family_value.split(",")]
# Return last font (the embedded one in fallback chain)
return families[-1] if families else None
# Try direct font-family attribute
font_family = element.get("font-family")
if font_family:
# Parse comma-separated list
families = [f.strip().strip("'\"") for f in font_family.split(",")]
# Return last font (the embedded one in fallback chain)
return families[-1] if families else None
return None
def _extract_font_family_with_inheritance(
element: ET.Element, parent_map: dict[ET.Element, ET.Element]
) -> str | None:
"""Extract font-family from element or inherited from parent.
Walks up the element tree to find the first font-family declaration.
This handles cases where <tspan> elements inherit font-family from
their parent <text> element.
Args:
element: XML element (text or tspan).
parent_map: Dictionary mapping child elements to their parents.
Returns:
Font family name, or None if not found in element or ancestors.
Example:
>>> <text font-family="Arial"><tspan>Hello</tspan></text>
>>> _extract_font_family_with_inheritance(tspan_element, parent_map)
"Arial"
"""
# Try current element first
font_family = _extract_font_family(element)
if font_family:
return font_family
# Walk up parent tree to find inherited font-family
current = element
while current in parent_map:
current = parent_map[current]
font_family = _extract_font_family(current)
if font_family:
return font_family
return None
def _extract_text_content(element: ET.Element) -> str:
"""Extract and decode all text content from element (including entities).
Args:
element: XML element.
Returns:
Decoded text content with XML entities resolved and control characters filtered.
Note:
- Handles numeric character references (一, 中)
- Handles named entities (<, >, &, etc.)
- Recursively includes text from child elements
- Filters out control characters (codepoints 0-31) which are not rendered in SVG
"""
# Collect all text (element.text and tail from all descendants)
text_parts = []
if element.text:
text_parts.append(element.text)
for child in element:
# Recursively get text from children
child_text = _extract_text_content(child)
if child_text:
text_parts.append(child_text)
# Also include tail text (text after child element)
if child.tail:
text_parts.append(child.tail)
raw_text = "".join(text_parts)
# Decode HTML/XML entities
decoded_text = html.unescape(raw_text)
# Filter out control characters (C0: 0-31, DEL: 127, C1: 128-159)
# These are not rendered in SVG text and cause incorrect font matching
# (e.g., newline causes Arial to be substituted with LastResort on macOS)
decoded_text = "".join(
char
for char in decoded_text
if ord(char) >= 32 and not (127 <= ord(char) <= 159)
)
return decoded_text
def _extract_direct_text_content(element: ET.Element) -> str:
"""Extract text content directly owned by this element (not children).
This function extracts only the text that belongs to the current element,
not text from child elements. This is important for font subsetting because
child elements may have different font-family attributes.
Args:
element: XML element.
Returns:
Decoded text content (only element.text, not children).
Example:
<text>Hello<tspan>World</tspan></text>
Returns: "Hello" (not "HelloWorld")
"""
if element.text:
# Decode HTML/XML entities
return html.unescape(element.text)
return ""
[docs]
def get_font_usage_from_svg(svg_tree: ET.Element) -> dict[str, set[str]]:
"""Get font usage information from SVG for subsetting.
This is a convenience wrapper around extract_used_unicode() that
logs appropriate messages about font usage.
Args:
svg_tree: Root SVG element to analyze.
Returns:
Dictionary mapping font-family names to sets of Unicode characters.
"""
font_usage = extract_used_unicode(svg_tree)
logger.debug(
f"Extracted {len(font_usage)} font(s) with "
f"{sum(len(chars) for chars in font_usage.values())} unique char(s)"
)
return font_usage
def _chars_to_unicode_list(chars: set[str]) -> list[int]:
"""Convert set of characters to list of Unicode code points.
Args:
chars: Set of Unicode characters (e.g., {"A", "あ", "中"}).
Returns:
List of Unicode code points (e.g., [0x41, 0x3042, 0x4E2D]).
Note:
- Handles multi-codepoint characters (emoji, combining marks)
- Returns unique code points sorted for deterministic output
"""
codepoints = set()
for char in chars:
# Each character may contain multiple code points (e.g., emoji with modifiers)
for code_point in char:
codepoints.add(ord(code_point))
return sorted(codepoints)