# Source code for psd2svg.font_subsetting

"""Font subsetting utilities for reducing embedded font file sizes."""

import html
import io
import logging
import re
import xml.etree.ElementTree as ET

from fontTools import subset
from fontTools.ttLib import TTFont

logger = logging.getLogger(__name__)


def extract_used_unicode(svg_tree: ET.Element) -> dict[str, set[str]]:
    """Collect the characters used by each font family in an SVG tree.

    Every element whose local tag name is ``text``, ``tspan``, ``p`` or
    ``span`` is inspected (the latter two cover XHTML content embedded via
    ``foreignObject``). For each such element the font family is resolved
    from the element itself or, failing that, from its nearest ancestor that
    declares one, and the element's own text is added to that family's
    character set.

    Args:
        svg_tree: Root SVG element to analyze.

    Returns:
        Dictionary mapping font-family names to sets of characters, e.g.
        ``{"Arial": {"A", "B", "C"}, "Noto Sans JP": {"あ", "い"}}``.
        Empty when no element contributes both a font family and text.

    Note:
        - Only the text directly owned by an element is attributed to it
          (as returned by ``_extract_direct_text_content``); children are
          visited separately so they can resolve their own font family.
        - Elements without a resolvable font family, or without text, are
          skipped.
    """
    text_tags = ("text", "tspan", "p", "span")

    # ElementTree has no parent pointers, so build a child -> parent lookup
    # once; it is needed to resolve inherited font-family declarations.
    parent_map = {
        child: parent for parent in svg_tree.iter() for child in parent
    }

    font_usage: dict[str, set[str]] = {}
    for node in svg_tree.iter():
        if _get_local_tag_name(node) not in text_tags:
            continue

        family = _extract_font_family_with_inheritance(node, parent_map)
        if not family:
            continue

        own_text = _extract_direct_text_content(node)
        if not own_text:
            continue

        # Updating a set with a string adds each character individually.
        font_usage.setdefault(family, set()).update(own_text)

    logger.debug(
        f"Extracted Unicode usage: {len(font_usage)} font(s), "
        f"{sum(len(chars) for chars in font_usage.values())} unique char(s) total"
    )
    return font_usage
def subset_font(
    input_path: str,
    output_format: str,
    unicode_codepoints: set[int],
) -> bytes:
    """Subset a font file to include only specified Unicode codepoints.

    Uses fontTools' subsetter to produce a reduced font containing only the
    glyphs needed for the requested codepoints, while keeping all tables,
    layout features, name records and glyph names.

    Args:
        input_path: Path to the input font file (TTF/OTF).
        output_format: Output format - "ttf", "otf", or "woff2".
        unicode_codepoints: Set of Unicode codepoints (integers) to include
            in the subset. When empty (or ``None``), a warning is logged and
            every glyph of the font is kept.

    Returns:
        Subset font file as bytes.

    Raises:
        ValueError: If ``output_format`` is not one of the supported formats.
        Exception: Any error raised by fontTools while loading, subsetting or
            saving the font is logged and re-raised unchanged.

    Note:
        "ttf" and "otf" both write a plain (uncompressed) font in whatever
        outline format the input uses; no outline conversion is performed.
        NOTE(review): "woff2" output relies on fontTools' WOFF2 support,
        which presumably needs the optional ``brotli`` package - confirm.

    Example:
        >>> codepoints = {0x41, 0x42, 0x43, 0x3042}  # A, B, C, あ
        >>> font_bytes = subset_font(  # doctest: +SKIP
        ...     "/usr/share/fonts/arial.ttf", "woff2", codepoints
        ... )
    """
    # Validate first so a bad format never touches the file system.
    if output_format not in ("ttf", "otf", "woff2"):
        raise ValueError(
            f"Unsupported font format: {output_format}. "
            f"Supported formats: ttf, otf, woff2"
        )

    if not unicode_codepoints:
        logger.warning(
            "No Unicode codepoints provided for subsetting, using all glyphs"
        )

    # Sorted list for deterministic input to fontTools; ``or ()`` lets a
    # ``None`` argument take the "all glyphs" path instead of crashing.
    unicodes = sorted(unicode_codepoints or ())

    logger.debug(
        f"Subsetting font: {input_path} -> {output_format} "
        f"({len(unicodes)} codepoint(s))"
    )

    try:
        # TTFont keeps the source file open for lazy table loading; the
        # ``with`` block guarantees it is closed even when subsetting fails.
        with TTFont(input_path) as font:
            options = subset.Options()
            options.drop_tables = []  # Keep all tables by default
            options.layout_features = ["*"]  # Preserve all OpenType features
            options.name_IDs = ["*"]  # Preserve all name table entries
            options.name_languages = ["*"]  # Preserve all languages
            options.notdef_outline = True  # Keep .notdef glyph
            options.glyph_names = True  # Preserve glyph names (helps debugging)

            subsetter = subset.Subsetter(options=options)
            if unicodes:
                subsetter.populate(unicodes=unicodes)
            else:
                # No characters specified: keep every glyph.
                subsetter.populate(glyphs=font.getGlyphOrder())
            subsetter.subset(font)

            # The flavor decides the container written by save(). Reset it
            # for ttf/otf so a WOFF/WOFF2 input is not re-emitted compressed.
            font.flavor = "woff2" if output_format == "woff2" else None

            output_buffer = io.BytesIO()
            font.save(output_buffer)
    except Exception as e:
        logger.error(f"Font subsetting failed for {input_path}: {e}")
        raise

    font_bytes = output_buffer.getvalue()
    logger.debug(
        f"Subsetting complete: {len(font_bytes)} bytes "
        f"(~{len(font_bytes) / 1024:.1f} KB)"
    )
    return font_bytes
def _get_local_tag_name(element: ET.Element) -> str: """Extract local tag name from element (handles namespaces). Args: element: XML element. Returns: Local tag name without namespace prefix. Example: >>> elem.tag = "{http://www.w3.org/2000/svg}text" >>> _get_local_tag_name(elem) "text" """ tag = element.tag if "}" in tag: return tag.split("}", 1)[1] return tag def _extract_font_family(element: ET.Element) -> str | None: """Extract font-family from element, returning LAST font in fallback chain. For fallback chains like "Arial", "DejaVu Sans", returns "DejaVu Sans" (the actually embedded font for subsetting). Args: element: XML element (text or tspan). Returns: Font family name (last in chain), or None if not found. Example: >>> _extract_font_family( ... <text style="font-family: 'Arial', 'DejaVu Sans'; ..."> ... ) "DejaVu Sans" >>> _extract_font_family(<text font-family="'Helvetica', 'Arial'">) "Arial" """ # Try style attribute first style = element.get("style", "") if style: match = re.search(r"font-family:\s*([^;]+)", style) if match: font_family_value = match.group(1).strip() # Parse comma-separated list of fonts families = [f.strip().strip("'\"") for f in font_family_value.split(",")] # Return last font (the embedded one in fallback chain) return families[-1] if families else None # Try direct font-family attribute font_family = element.get("font-family") if font_family: # Parse comma-separated list families = [f.strip().strip("'\"") for f in font_family.split(",")] # Return last font (the embedded one in fallback chain) return families[-1] if families else None return None def _extract_font_family_with_inheritance( element: ET.Element, parent_map: dict[ET.Element, ET.Element] ) -> str | None: """Extract font-family from element or inherited from parent. Walks up the element tree to find the first font-family declaration. This handles cases where <tspan> elements inherit font-family from their parent <text> element. Args: element: XML element (text or tspan). 
parent_map: Dictionary mapping child elements to their parents. Returns: Font family name, or None if not found in element or ancestors. Example: >>> <text font-family="Arial"><tspan>Hello</tspan></text> >>> _extract_font_family_with_inheritance(tspan_element, parent_map) "Arial" """ # Try current element first font_family = _extract_font_family(element) if font_family: return font_family # Walk up parent tree to find inherited font-family current = element while current in parent_map: current = parent_map[current] font_family = _extract_font_family(current) if font_family: return font_family return None def _extract_text_content(element: ET.Element) -> str: """Extract and decode all text content from element (including entities). Args: element: XML element. Returns: Decoded text content with XML entities resolved and control characters filtered. Note: - Handles numeric character references (&#x4E00;, &#20013;) - Handles named entities (&lt;, &gt;, &amp;, etc.) - Recursively includes text from child elements - Filters out control characters (codepoints 0-31) which are not rendered in SVG """ # Collect all text (element.text and tail from all descendants) text_parts = [] if element.text: text_parts.append(element.text) for child in element: # Recursively get text from children child_text = _extract_text_content(child) if child_text: text_parts.append(child_text) # Also include tail text (text after child element) if child.tail: text_parts.append(child.tail) raw_text = "".join(text_parts) # Decode HTML/XML entities decoded_text = html.unescape(raw_text) # Filter out control characters (C0: 0-31, DEL: 127, C1: 128-159) # These are not rendered in SVG text and cause incorrect font matching # (e.g., newline causes Arial to be substituted with LastResort on macOS) decoded_text = "".join( char for char in decoded_text if ord(char) >= 32 and not (127 <= ord(char) <= 159) ) return decoded_text def _extract_direct_text_content(element: ET.Element) -> str: """Extract text 
content directly owned by this element (not children). This function extracts only the text that belongs to the current element, not text from child elements. This is important for font subsetting because child elements may have different font-family attributes. Args: element: XML element. Returns: Decoded text content (only element.text, not children). Example: <text>Hello<tspan>World</tspan></text> Returns: "Hello" (not "HelloWorld") """ if element.text: # Decode HTML/XML entities return html.unescape(element.text) return ""
def get_font_usage_from_svg(svg_tree: ET.Element) -> dict[str, set[str]]:
    """Return per-font character usage for an SVG tree, for subsetting.

    Thin convenience wrapper: delegates to ``extract_used_unicode()`` and
    emits a debug-level summary of the result.

    Args:
        svg_tree: Root SVG element to analyze.

    Returns:
        Dictionary mapping font-family names to sets of Unicode characters.
    """
    usage_by_font = extract_used_unicode(svg_tree)

    font_count = len(usage_by_font)
    char_count = sum(len(chars) for chars in usage_by_font.values())
    logger.debug(
        f"Extracted {font_count} font(s) with "
        f"{char_count} unique char(s)"
    )

    return usage_by_font
def _chars_to_unicode_list(chars: set[str]) -> list[int]:
    """Convert a set of characters to a sorted list of Unicode code points.

    Args:
        chars: Set of Unicode characters (e.g., {"A", "あ", "中"}).

    Returns:
        Unique code points in ascending order
        (e.g., [0x41, 0x3042, 0x4E2D]).

    Note:
        - An entry may hold several code points (emoji with modifiers,
          combining marks); each code point is included individually
        - Sorting makes the output deterministic
    """
    return sorted({ord(code_point) for char in chars for code_point in char})