diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7a197a9d4aa8..2a48276707ce 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -64,8 +64,10 @@ jobs: python-version: '3.12' # https://github.com/matplotlib/matplotlib/issues/29844 pygobject-ver: '<3.52.0' - - os: ubuntu-22.04 + - name-suffix: "(Extra TeX packages)" + os: ubuntu-22.04 python-version: '3.13' + extra-packages: 'texlive-fonts-extra texlive-lang-cyrillic' # https://github.com/matplotlib/matplotlib/issues/29844 pygobject-ver: '<3.52.0' - name-suffix: "Free-threaded" @@ -142,7 +144,8 @@ jobs: texlive-latex-recommended \ texlive-luatex \ texlive-pictures \ - texlive-xetex + texlive-xetex \ + ${{ matrix.extra-packages }} if [[ "${{ matrix.name-suffix }}" != '(Minimum Versions)' ]]; then sudo apt-get install -yy --no-install-recommends ffmpeg poppler-utils fi diff --git a/doc/users/next_whats_new/type1_subset.rst b/doc/users/next_whats_new/type1_subset.rst new file mode 100644 index 000000000000..b0ab0a4337e6 --- /dev/null +++ b/doc/users/next_whats_new/type1_subset.rst @@ -0,0 +1,9 @@ +PDF files created with usetex now embed subsets of Type 1 fonts +--------------------------------------------------------------- + +When using the PDF backend with the usetex feature, +Matplotlib calls TeX to render the text and formulas in the figure. +The fonts that get used are usually "Type 1" fonts. +They used to be embedded in full +but are now limited to the glyphs that are actually used in the figure. +This reduces the size of the resulting PDF files. diff --git a/lib/matplotlib/_type1font.py b/lib/matplotlib/_type1font.py index 032b6a42ea63..33b22adbae73 100644 --- a/lib/matplotlib/_type1font.py +++ b/lib/matplotlib/_type1font.py @@ -3,7 +3,7 @@ This version reads pfa and pfb files and splits them for embedding in pdf files. It also supports SlantFont and ExtendFont transformations, -similarly to pdfTeX and friends. There is no support yet for subsetting. +similarly to pdfTeX and friends. Usage:: @@ -11,6 +11,7 @@ clear_part, encrypted_part, finale = font.parts slanted_font = font.transform({'slant': 0.167}) extended_font = font.transform({'extend': 1.2}) + subset_font = font.subset([ord(c) for c in 'Hello World']) Sources: @@ -25,6 +26,7 @@ import binascii import functools +import itertools import logging import re import string @@ -637,8 +639,7 @@ def _parse_subrs(self, tokens, _data): return array, next(tokens).endpos() - @staticmethod - def _parse_charstrings(tokens, _data): + def _parse_charstrings(self, tokens, _data): count_token = next(tokens) if not count_token.is_number(): raise RuntimeError( @@ -660,7 +661,12 @@ def _parse_charstrings(tokens, _data): f"Token following /{glyphname} in CharStrings definition " f"must be a number, was {nbytes_token}" ) - next(tokens) # usually RD or |- + token = next(tokens) + if not token.is_keyword(self._abbr['RD']): + raise RuntimeError( + f"Token preceding charstring must be {self._abbr['RD']}, " + f"was {token}" + ) binary_token = tokens.send(1+nbytes_token.value()) charstrings[glyphname] = binary_token.value() @@ -691,8 +697,7 @@ def _parse_encoding(tokens, _data): continue encoding[index_token.value()] = name_token.value() - @staticmethod - def _parse_othersubrs(tokens, data): + def _parse_othersubrs(self, tokens, data): init_pos = None while True: token = next(tokens) @@ -700,7 +705,7 @@ def _parse_othersubrs(tokens, data): init_pos = token.pos if token.is_delim(): _expression(token, tokens, data) - elif token.is_keyword('def', 'ND', '|-'): + elif token.is_keyword('def', self._abbr['ND']): return data[init_pos:token.endpos()], token.endpos() def transform(self, effects): @@ -755,7 +760,7 @@ def transform(self, effects): fontmatrix = ( f"[{' '.join(_format_approx(x, 6) for x in array)}]" ) - replacements = ( + newparts = self._replace( [(x, f'/FontName/{fontname} def') for x in self._pos['FontName']] + [(x, f'/ItalicAngle {italicangle} def') @@ -765,11 +770,63 @@ def transform(self, effects): + [(x, '') for x in self._pos.get('UniqueID', [])] ) + return Type1Font(( + newparts[0], + self._encrypt(newparts[1], 'eexec'), + self.parts[2] + )) + + def with_encoding(self, encoding): + """ + Change the encoding of the font. + + Parameters + ---------- + encoding : dict + A dictionary mapping character codes to glyph names. + + Returns + ------- + `Type1Font` + """ + newparts = self._replace( + [(x, '') for x in self._pos.get('UniqueID', [])] + + [(self._pos['Encoding'][0], self._postscript_encoding(encoding))] + ) + return Type1Font(( + newparts[0], + self._encrypt(newparts[1], 'eexec'), + self.parts[2] + )) + + def _replace(self, replacements): + """ + Change the font according to `replacements` + + Parameters + ---------- + replacements : list of ((int, int), str) + Each element is ((pos0, pos1), replacement) where pos0 and + pos1 are indices to the original font data (parts[0] and the + decrypted part concatenated). The data in the interval + pos0:pos1 will be replaced by the replacement text. To + accommodate binary data, the replacement is taken to be in + Latin-1 encoding. + + The case where pos0 is inside parts[0] and pos1 inside + the decrypted part is not supported. + + Returns + ------- + (bytes, bytes) + The new parts[0] and decrypted part (which needs to be + encrypted in the transformed font). + """ data = bytearray(self.parts[0]) data.extend(self.decrypted) len0 = len(self.parts[0]) for (pos0, pos1), value in sorted(replacements, reverse=True): - data[pos0:pos1] = value.encode('ascii', 'replace') + data[pos0:pos1] = value.encode('latin-1') if pos0 < len(self.parts[0]): if pos1 >= len(self.parts[0]): raise RuntimeError( @@ -778,13 +835,275 @@ def transform(self, effects): ) len0 += len(value) - pos1 + pos0 - data = bytes(data) - return Type1Font(( - data[:len0], - self._encrypt(data[len0:], 'eexec'), + return bytes(data[:len0]), bytes(data[len0:]) + + def subset(self, characters, name_prefix): + """ + Return a new font that only defines the given characters. + + Parameters + ---------- + characters : sequence of bytes + The subset of characters to include. These are indices into the + font's encoding array. The encoding array of a Type-1 font can + only include 256 characters, but other glyphs may be accessed + via the seac operator. + name_prefix : str + Prefix to prepend to the font name. + + Returns + ------- + `Type1Font` + """ + characters = frozenset(characters) + if _log.isEnabledFor(logging.DEBUG): + _log.debug( + "Subsetting font %s to characters %s = %s", + self.prop['FontName'], + sorted(characters), + [self.prop['Encoding'].get(code) for code in sorted(characters)], + ) + encoding = {code: glyph + for code, glyph in self.prop['Encoding'].items() + if code in characters} + encoding[0] = '.notdef' + # todo and done include strings (glyph names) + todo = set(encoding.values()) + done = set() + seen_subrs = {0, 1, 2, 3} + while todo: + glyph = todo.pop() + called_glyphs, called_subrs = _CharstringSimulator(self).run(glyph) + todo.update(called_glyphs - done) + seen_subrs.update(called_subrs) + done.add(glyph) + + charstrings = self._subset_charstrings(done) + subrs = self._subset_subrs(seen_subrs) + newparts = self._replace( + [(x, f'/FontName /{name_prefix}{self.prop["FontName"]} def') + for x in self._pos['FontName']] + + [(self._pos['CharStrings'][0], charstrings), + (self._pos['Subrs'][0], subrs), + (self._pos['Encoding'][0], self._postscript_encoding(encoding)) + ] + [(x, '') for x in self._pos.get('UniqueID', [])] + ) + return type(self)(( + newparts[0], + self._encrypt(newparts[1], 'eexec'), self.parts[2] )) + @staticmethod + def _charstring_tokens(data): + """Parse a Type-1 charstring + + Yield opcode names and integer parameters. + """ + data = iter(data) + for byte in data: + if 32 <= byte <= 246: + yield byte - 139 + elif 247 <= byte <= 250: + byte2 = next(data) + yield (byte-247) * 256 + byte2 + 108 + elif 251 <= byte <= 254: + byte2 = next(data) + yield -(byte-251)*256 - byte2 - 108 + elif byte == 255: + bs = bytes(itertools.islice(data, 4)) + yield struct.unpack('>i', bs)[0] + elif byte == 12: + byte1 = next(data) + yield { + 0: 'dotsection', + 1: 'vstem3', + 2: 'hstem3', + 6: 'seac', + 7: 'sbw', + 12: 'div', + 16: 'callothersubr', + 17: 'pop', + 33: 'setcurrentpoint' + }[byte1] + else: + yield { + 1: 'hstem', + 3: 'vstem', + 4: 'vmoveto', + 5: 'rlineto', + 6: 'hlineto', + 7: 'vlineto', + 8: 'rrcurveto', + 9: 'closepath', + 10: 'callsubr', + 11: 'return', + 13: 'hsbw', + 14: 'endchar', + 21: 'rmoveto', + 22: 'hmoveto', + 30: 'vhcurveto', + 31: 'hvcurveto' + }[byte] + + def _postscript_encoding(self, encoding): + """Return a PostScript encoding array for the encoding.""" + return '\n'.join([ + '/Encoding 256 array\n0 1 255 { 1 index exch /.notdef put} for', + *( + f'dup {i} /{glyph} put' + for i, glyph in sorted(encoding.items()) + if glyph != '.notdef' + ), + 'readonly def\n', + ]) + + def _subset_charstrings(self, glyphs): + """Return a PostScript CharStrings array for the glyphs.""" + charstrings = self.prop['CharStrings'] + lenIV = self.prop.get('lenIV', 4) + ordered = sorted(glyphs) + encrypted = [ + self._encrypt(charstrings[glyph], 'charstring', lenIV).decode('latin-1') + for glyph in ordered + ] + RD, ND = self._abbr['RD'], self._abbr['ND'] + return '\n'.join([ + f'/CharStrings {len(ordered)} dict dup begin', + *( + f'/{glyph} {len(enc)} {RD} {enc} {ND}' + for glyph, enc in zip(ordered, encrypted) + ), + 'end\n', + ]) + + def _subset_subrs(self, indices): + """Return a PostScript Subrs array for the subroutines.""" + # we can't remove subroutines, we just replace unused ones with a stub + subrs = self.prop['Subrs'] + n_subrs = len(subrs) + lenIV = self.prop.get('lenIV', 4) + stub = self._encrypt(b'\x0b', 'charstring', lenIV).decode('latin-1') + encrypted = [ + self._encrypt(subrs[i], 'charstring', lenIV).decode('latin-1') + if i in indices + else stub + for i in range(n_subrs) + ] + RD, ND, NP = self._abbr['RD'], self._abbr['ND'], self._abbr['NP'] + return '\n'.join([ + f'/Subrs {n_subrs} array', + *( + f'dup {i} {len(enc)} {RD} {enc} {NP}' + for i, enc in enumerate(encrypted) + ), + ]) + + +class _CharstringSimulator: + __slots__ = ('font', 'buildchar_stack', 'postscript_stack', 'glyphs', 'subrs') + + def __init__(self, font): + self.font = font + self.buildchar_stack = [] + self.postscript_stack = [] + self.glyphs = set() + self.subrs = set() + + def run(self, glyph_or_subr): + """Run the charstring interpreter on a glyph or subroutine. + + This does not actually execute the code but simulates it to find out + which subroutines get called when executing the glyph or subroutine. + + Parameters + ---------- + glyph_or_subr : str or int + The name of the glyph or the index of the subroutine to simulate. + + Returns + ------- + glyphs : set[str] + The set of glyph names called by the glyph or subroutine. + subrs : set[int] + The set of subroutines called by the glyph or subroutine. + """ + if isinstance(glyph_or_subr, str): + program = self.font.prop['CharStrings'][glyph_or_subr] + self.glyphs.add(glyph_or_subr) + else: + program = self.font.prop['Subrs'][glyph_or_subr] + self.subrs.add(glyph_or_subr) + for opcode in self.font._charstring_tokens(program): + if opcode in ('return', 'endchar'): + return self.glyphs, self.subrs + self._step(opcode) + else: + font_name = self.font.prop.get('FontName', '(unknown)') + _log.info( + f"Glyph or subr {glyph_or_subr} in font {font_name} does not end " + "with return or endchar" + ) + return self.glyphs, self.subrs + + def _step(self, opcode): + """Run one step in the charstring interpreter.""" + match opcode: + case int(): + self.buildchar_stack.append(opcode) + case ( + 'hsbw' | 'sbw' | 'closepath' | 'hlineto' | 'hmoveto' | 'hcurveto' | + 'hvcurveto' | 'rlineto' | 'rmoveto' | 'rrcurveto' | 'vhcurveto' | + 'vlineto' | 'vmoveto' | 'dotsection' | 'hstem' | 'hstem3' | + 'vstem' | 'vstem3' | 'setcurrentpoint' + ): + self.buildchar_stack.clear() + case 'seac': # Standard Encoding Accented Character + codes = self.buildchar_stack[3:5] + self.glyphs.update(_StandardEncoding[int(x)] for x in codes) + self.buildchar_stack.clear() + case 'div': + num1, num2 = self.buildchar_stack[-2:] + if num2 == 0: + _log.warning( + f"Division by zero in font {self.font.prop['FontName']}" + ) + self.buildchar_stack[-2:] = [0] + else: + self.buildchar_stack[-2:] = [num1/num2] + case 'callothersubr': + n, othersubr = self.buildchar_stack[-2:] + if not isinstance(n, int): + _log.warning( + f"callothersubr {othersubr} with non-integer argument " + f"count in font {self.font.prop['FontName']}" + ) + n = int(n) + args = self.buildchar_stack[-2-n:-2] + if othersubr == 3: + self.postscript_stack.append(args[0]) + else: + self.postscript_stack.extend(args[::-1]) + self.buildchar_stack[-2-n:] = [] + case 'callsubr': + subr = self.buildchar_stack.pop() + if not isinstance(subr, int): + _log.warning( + f"callsubr with non-integer argument {subr} in font " + f"{self.font.prop['FontName']}" + ) + subr = int(subr) + self.run(subr) + case 'pop': + if not self.postscript_stack: + _log.warning( + f"pop with empty stack in font {self.font.prop['FontName']}" + ) + self.postscript_stack.append(0) + self.buildchar_stack.append(self.postscript_stack.pop()) + case _: + raise RuntimeError(f'opcode {opcode}') + _StandardEncoding = { **{ord(letter): letter for letter in string.ascii_letters}, diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index 073ca05bc172..6f3cb8afa1ac 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -722,8 +722,6 @@ def __init__(self, filename, metadata=None): self._internal_font_seq = (Name(f'F{i}') for i in itertools.count(1)) self._fontNames = {} # maps filenames to internal font names self._dviFontInfo = {} # maps dvi font names to embedding information - # differently encoded Type-1 fonts may share the same descriptor - self._type1Descriptors = {} self._character_tracker = _backend_pdf_ps.CharacterTracker() self.alphaStates = {} # maps alpha values to graphics state objects @@ -767,8 +765,7 @@ def __init__(self, filename, metadata=None): fontNames = _api.deprecated("3.11")(property(lambda self: self._fontNames)) dviFontInfo = _api.deprecated("3.11")(property(lambda self: self._dviFontInfo)) - type1Descriptors = _api.deprecated("3.11")( - property(lambda self: self._type1Descriptors)) + type1Descriptors = _api.deprecated("3.11")(property(lambda _: {})) def newPage(self, width, height): self.endStream() @@ -808,7 +805,14 @@ def newTextnote(self, text, positionRect=[-100, -100, 0, 0]): } self.pageAnnotations.append(theNote) - def _get_subsetted_psname(self, ps_name, charmap): + @staticmethod + def _get_subset_prefix(charset): + """ + Get a prefix for a subsetted font name. + + The prefix is six uppercase letters followed by a plus sign; + see PDF reference section 5.5.3 Font Subsets. + """ def toStr(n, base): if n < base: return string.ascii_uppercase[n] @@ -818,11 +822,15 @@ def toStr(n, base): ) # encode to string using base 26 - hashed = hash(frozenset(charmap.keys())) % ((sys.maxsize + 1) * 2) + hashed = hash(charset) % ((sys.maxsize + 1) * 2) prefix = toStr(hashed, 26) # get first 6 characters from prefix - return prefix[:6] + "+" + ps_name + return prefix[:6] + "+" + + @staticmethod + def _get_subsetted_psname(ps_name, charmap): + return PdfFile._get_subset_prefix(frozenset(charmap.keys())) + ps_name def finalize(self): """Write out the various deferred objects and the pdf end matter.""" @@ -994,53 +1002,60 @@ def _embedTeXFont(self, fontinfo): _log.debug('Embedding TeX font %s - fontinfo=%s', fontinfo.dvifont.texname, fontinfo.__dict__) - # Widths - widthsObject = self.reserveObject('font widths') - tfm = fontinfo.dvifont._tfm - # convert from TeX's 12.20 representation to 1/1000 text space units. - widths = [(1000 * metrics.tex_width) >> 20 - if (metrics := tfm.get_metrics(char)) else 0 - for char in range(max(tfm._glyph_metrics, default=-1) + 1)] - self.writeObject(widthsObject, widths) - - # Font dictionary + # The font dictionary is the top-level object describing a font fontdictObject = self.reserveObject('font dictionary') fontdict = { 'Type': Name('Font'), 'Subtype': Name('Type1'), - 'FirstChar': 0, - 'LastChar': len(widths) - 1, - 'Widths': widthsObject, - } - - # Encoding (if needed) - if fontinfo.encodingfile is not None: - fontdict['Encoding'] = { - 'Type': Name('Encoding'), - 'Differences': [ - 0, *map(Name, dviread._parse_enc(fontinfo.encodingfile))], - } + } - # We have a font file to embed - read it in and apply any effects + # Read the font file and apply any encoding changes and effects t1font = _type1font.Type1Font(fontinfo.fontfile) + if fontinfo.encodingfile is not None: + t1font = t1font.with_encoding( + {i: c for i, c in enumerate(dviread._parse_enc(fontinfo.encodingfile))} + ) if fontinfo.effects: t1font = t1font.transform(fontinfo.effects) - fontdict['BaseFont'] = Name(t1font.prop['FontName']) - # Font descriptors may be shared between differently encoded - # Type-1 fonts, so only create a new descriptor if there is no - # existing descriptor for this font. - effects = (fontinfo.effects.get('slant', 0.0), - fontinfo.effects.get('extend', 1.0)) - fontdesc = self._type1Descriptors.get((fontinfo.fontfile, effects)) - if fontdesc is None: - fontdesc = self.createType1Descriptor(t1font) - self._type1Descriptors[(fontinfo.fontfile, effects)] = fontdesc - fontdict['FontDescriptor'] = fontdesc + # Reduce the font to only the glyphs used in the document, get the encoding + # for that subset, and compute various properties based on the encoding. + chars = frozenset(self._character_tracker.used[fontinfo.dvifont.fname]) + t1font = t1font.subset(chars, self._get_subset_prefix(chars)) + fontdict['BaseFont'] = Name(t1font.prop['FontName']) + # createType1Descriptor writes the font data as a side effect + fontdict['FontDescriptor'] = self.createType1Descriptor(t1font) + encoding = t1font.prop['Encoding'] + fontdict['Encoding'] = self._generate_encoding(encoding) + fc = fontdict['FirstChar'] = min(encoding.keys(), default=0) + lc = fontdict['LastChar'] = max(encoding.keys(), default=255) + + # Convert glyph widths from TeX 12.20 fixed point to 1/1000 text space units + tfm = fontinfo.dvifont._tfm + widths = [(1000 * metrics.tex_width) >> 20 + if (metrics := tfm.get_metrics(char)) else 0 + for char in range(fc, lc + 1)] + fontdict['Widths'] = widthsObject = self.reserveObject('glyph widths') + self.writeObject(widthsObject, widths) self.writeObject(fontdictObject, fontdict) return fontdictObject + + def _generate_encoding(self, encoding): + prev = -2 + result = [] + for code, name in sorted(encoding.items()): + if code != prev + 1: + result.append(code) + prev = code + result.append(Name(name)) + return { + 'Type': Name('Encoding'), + 'Differences': result + } + + @_api.delete_parameter("3.11", "fontfile") def createType1Descriptor(self, t1font, fontfile=None): # Create and write the font descriptor and the font file @@ -1077,6 +1092,14 @@ def createType1Descriptor(self, t1font, fontfile=None): if 0: flags |= 1 << 18 + encoding = t1font.prop['Encoding'] + charset = ''.join( + sorted( + f'/{c}' for c in encoding.values() + if c != '.notdef' + ) + ) + descriptor = { 'Type': Name('FontDescriptor'), 'FontName': Name(t1font.prop['FontName']), @@ -1090,6 +1113,7 @@ def createType1Descriptor(self, t1font, fontfile=None): 'FontFile': fontfileObject, 'FontFamily': t1font.prop['FamilyName'], 'StemV': 50, # TODO + 'CharSet': charset, # (see also revision 3874; but not all TeX distros have AFM files!) # 'FontWeight': a number where 400 = Regular, 700 = Bold } @@ -2267,6 +2291,7 @@ def draw_tex(self, gc, x, y, s, prop, angle, *, mtext=None): seq += [['font', pdfname, dvifont.size]] oldfont = dvifont seq += [['text', x1, y1, [bytes([glyph])], x1+width]] + self.file._character_tracker.track(dvifont, chr(glyph)) # Find consecutive text strings with constant y coordinate and # combine into a sequence of strings and kerns, or just one diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py index a588979f5fad..9e8b6a5facf5 100644 --- a/lib/matplotlib/dviread.py +++ b/lib/matplotlib/dviread.py @@ -17,17 +17,17 @@ ... """ -from collections import namedtuple import dataclasses import enum -from functools import cache, lru_cache, partial, wraps import logging import os -from pathlib import Path import re import struct import subprocess import sys +from collections import namedtuple +from functools import cache, lru_cache, partial, wraps +from pathlib import Path import numpy as np @@ -583,6 +583,9 @@ class DviFont: Attributes ---------- texname : bytes + fname : str + Compatibility shim so that DviFont can be used with + ``_backend_pdf_ps.CharacterTracker``; not a real filename. size : float Size of the font in Adobe points, converted from the slightly smaller TeX points. @@ -602,6 +605,18 @@ def __init__(self, scale, tfm, texname, vf): (1000 * self._tfm.width.get(char, 0)) >> 20 for char in range(max(self._tfm.width, default=-1) + 1)])) + @property + def fname(self): + """A fake filename""" + return self.texname.decode('latin-1') + + def _get_fontmap(self, string): + """Get the mapping from characters to the font that includes them. + + Each value maps to self; there is no fallback mechanism for DviFont. + """ + return {char: self for char in string} + def __eq__(self, other): return (type(self) is type(other) and self.texname == other.texname and self.size == other.size) @@ -1161,8 +1176,8 @@ def _fontfile(cls, suffix, texname): if __name__ == '__main__': - from argparse import ArgumentParser import itertools + from argparse import ArgumentParser import fontTools.agl diff --git a/lib/matplotlib/dviread.pyi b/lib/matplotlib/dviread.pyi index 41799c083218..12a9215b5308 100644 --- a/lib/matplotlib/dviread.pyi +++ b/lib/matplotlib/dviread.pyi @@ -66,6 +66,8 @@ class DviFont: def __ne__(self, other: object) -> bool: ... @property def widths(self) -> list[int]: ... + @property + def fname(self) -> str: ... class Vf(Dvi): def __init__(self, filename: str | os.PathLike) -> None: ... diff --git a/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-bitstream-charter.pdf b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-bitstream-charter.pdf new file mode 100644 index 000000000000..c8f9411fb3d9 Binary files /dev/null and b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-bitstream-charter.pdf differ diff --git a/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-dejavusans.pdf b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-dejavusans.pdf new file mode 100644 index 000000000000..fd907dee6687 Binary files /dev/null and b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-dejavusans.pdf differ diff --git a/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-heuristica.pdf b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-heuristica.pdf new file mode 100644 index 000000000000..ca9b38d09b89 Binary files /dev/null and b/lib/matplotlib/tests/baseline_images/test_backend_pdf/font-heuristica.pdf differ diff --git a/lib/matplotlib/tests/test_backend_pdf.py b/lib/matplotlib/tests/test_backend_pdf.py index dc349e8dfa35..f126fb543e78 100644 --- a/lib/matplotlib/tests/test_backend_pdf.py +++ b/lib/matplotlib/tests/test_backend_pdf.py @@ -16,7 +16,7 @@ from matplotlib.backends._backend_pdf_ps import get_glyphs_subset, font_as_file from matplotlib.backends.backend_pdf import PdfPages from matplotlib.patches import Rectangle -from matplotlib.testing import _gen_multi_font_text +from matplotlib.testing import _gen_multi_font_text, _has_tex_package from matplotlib.testing.decorators import check_figures_equal, image_comparison from matplotlib.testing._markers import needs_usetex @@ -428,3 +428,53 @@ def test_truetype_conversion(recwarn): font=Path(__file__).parent / "data/mpltest.ttf", fontsize=80) ax.set_xticks([]) ax.set_yticks([]) + + +@pytest.mark.skipif(not _has_tex_package("heuristica"), + reason="LaTeX lacks heuristica package") +@image_comparison(["font-heuristica.pdf"]) +def test_font_heuristica(): + # Heuristica uses the callothersubr operator for some glyphs + mpl.rcParams['text.latex.preamble'] = '\n'.join(( + r'\usepackage{heuristica}', + r'\usepackage[T1]{fontenc}', + r'\usepackage[utf8]{inputenc}' + )) + fig, ax = plt.subplots() + ax.text(0.1, 0.1, r"BHTem fi ffl 1234", usetex=True, fontsize=50) + ax.set_xticks([]) + ax.set_yticks([]) + + +@pytest.mark.skipif(not _has_tex_package("DejaVuSans"), + reason="LaTeX lacks DejaVuSans package") +@image_comparison(["font-dejavusans.pdf"]) +def test_font_dejavusans(): + # DejaVuSans uses the seac operator to compose characters with diacritics + mpl.rcParams['text.latex.preamble'] = '\n'.join(( + r'\usepackage{DejaVuSans}', + r'\usepackage[T1]{fontenc}', + r'\usepackage[utf8]{inputenc}' + )) + + fig, ax = plt.subplots() + ax.text(0.1, 0.1, r"\textsf{ñäö ABCDabcd}", usetex=True, fontsize=50) + ax.text(0.1, 0.3, r"\textsf{fi ffl 1234}", usetex=True, fontsize=50) + ax.set_xticks([]) + ax.set_yticks([]) + + +@pytest.mark.skipif(not _has_tex_package("charter"), + reason="LaTeX lacks charter package") +@image_comparison(["font-bitstream-charter.pdf"]) +def test_font_bitstream_charter(): + mpl.rcParams['text.latex.preamble'] = '\n'.join(( + r'\usepackage{charter}', + r'\usepackage[T1]{fontenc}', + r'\usepackage[utf8]{inputenc}' + )) + fig, ax = plt.subplots() + ax.text(0.1, 0.1, r"åüš ABCDabcd", usetex=True, fontsize=50) + ax.text(0.1, 0.3, r"fi ffl 1234", usetex=True, fontsize=50) + ax.set_xticks([]) + ax.set_yticks([]) diff --git a/lib/matplotlib/tests/test_usetex.py b/lib/matplotlib/tests/test_usetex.py index 0b6d6d5e5535..95eb69325622 100644 --- a/lib/matplotlib/tests/test_usetex.py +++ b/lib/matplotlib/tests/test_usetex.py @@ -1,3 +1,4 @@ +import re from tempfile import TemporaryFile import numpy as np @@ -156,6 +157,69 @@ def test_missing_psfont(fmt, monkeypatch): fig.savefig(tmpfile, format=fmt) +def test_pdf_type1_font_subsetting(): + """Test that fonts in PDF output are properly subset.""" + pikepdf = pytest.importorskip("pikepdf") + + mpl.rcParams["text.usetex"] = True + mpl.rcParams["text.latex.preamble"] = r"\usepackage{amssymb}" + fig, ax = plt.subplots() + ax.text(0.2, 0.7, r"$\int_{-\infty}^{\aleph}\sqrt{\alpha\beta\gamma}\mathrm{d}x$") + ax.text(0.2, 0.5, r"$\mathfrak{x}\circledcirc\mathfrak{y}\in\mathbb{R}$") + + with TemporaryFile() as tmpfile: + fig.savefig(tmpfile, format="pdf") + tmpfile.seek(0) + pdf = pikepdf.Pdf.open(tmpfile) + + length = {} + page = pdf.pages[0] + for font_name, font in page.Resources.Font.items(): + assert font.Subtype == "/Type1", ( + f"Font {font_name}={font} is not a Type 1 font" + ) + + # Subsetted font names have a 6-character tag followed by a '+' + base_font = str(font["/BaseFont"]).removeprefix("/") + assert re.match(r"^[A-Z]{6}\+", base_font), ( + f"Font {font_name}={base_font} lacks a subset indicator tag" + ) + assert "/FontFile" in font.FontDescriptor, ( + f"Type 1 font {font_name}={base_font} is not embedded" + ) + _, original_name = base_font.split("+", 1) + length[original_name] = len(bytes(font["/FontDescriptor"]["/FontFile"])) + + print("Embedded font stream lengths:", length) + # We should have several fonts, each much smaller than the original. + # I get under 10kB on my system for each font, but allow 15kB in case + # of differences in the font files. + assert { + 'CMEX10', + 'CMMI12', + 'CMR12', + 'CMSY10', + 'CMSY8', + 'EUFM10', + 'MSAM10', + 'MSBM10', + }.issubset(length), "Missing expected fonts in the PDF" + for font_name, length in length.items(): + assert length < 15_000, ( + f"Font {font_name}={length} is larger than expected" + ) + + # For comparison, lengths without subsetting on my system: + # 'CMEX10': 29686 + # 'CMMI12': 36176 + # 'CMR12': 32157 + # 'CMSY10': 32004 + # 'CMSY8': 32061 + # 'EUFM10': 20546 + # 'MSAM10': 31199 + # 'MSBM10': 34129 + + try: _old_gs_version = mpl._get_executable_info('gs').version < parse_version('9.55') except mpl.ExecutableNotFoundError:
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: