diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 207a952811..8f64ecaf13 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -50,6 +50,7 @@
 else:
     from io import StringIO
 
+import binascii
 import struct
 import zlib
 
@@ -169,25 +170,29 @@ def decode(data, decodeParms=None):
         :return: a string conversion in base-7 ASCII, where each of its values
             v is such that 0 <= ord(v) <= 127.
         """
-        retval = ""
-        hex_pair = ""
-        index = 0
-        while True:
-            if index >= len(data):
-                raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
-            char = data[index]
-            if char == ">":
-                break
-            elif char.isspace():
-                index += 1
-                continue
-            hex_pair += char
-            if len(hex_pair) == 2:
-                retval += chr(int(hex_pair, base=16))
-                hex_pair = ""
-            index += 1
-        assert hex_pair == ""
-        return retval
+        # CVE-2026-28804: the previous character-by-character accumulation
+        # (retval += ..., hex_pair += ...) is quadratic, so a large
+        # /ASCIIHexDecode stream caused excessive CPU time. Locate the EOD
+        # marker once, strip whitespace, and bulk-decode with binascii.
+        is_bytes = isinstance(data, bytes)
+        eod = data.find(b">" if is_bytes else ">")
+        if eod == -1:
+            raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
+        hex_str = (b"" if is_bytes else "").join(data[:eod].split())
+        # Per ISO 32000 §7.4.2, a final odd hex digit is assumed to be
+        # followed by a "0".
+        if len(hex_str) % 2 == 1:
+            hex_str += b"0" if is_bytes else "0"
+        # unhexlify raises ValueError (not binascii.Error) for non-ASCII text.
+        try:
+            return binascii.unhexlify(hex_str)
+        except (binascii.Error, TypeError, ValueError):
+            raise PdfStreamError("Invalid hexadecimal data in ASCIIHexDecode")
+
+
+# CVE-2025-62708 / CVE-2025-66019: bound LZWDecode output so a small stream
+# cannot amplify into gigabytes of memory. Set to 0 to disable (trusted input).
+LZW_MAX_OUTPUT_LENGTH = 75000000  # 75 MB
 
 
 class LZWDecode(object):
@@ -196,10 +201,15 @@ class LZWDecode(object):
     """
 
     class Decoder(object):
-        def __init__(self, data):
+        def __init__(self, data, max_output_length=None):
             self.STOP = 257
             self.CLEARDICT = 256
             self.data = data
+            # Resolve the limit at call time, not at definition time, so that
+            # changing the module-level LZW_MAX_OUTPUT_LENGTH (e.g. to 0 to
+            # disable the cap) affects decoders created afterwards.
+            if max_output_length is None:
+                max_output_length = LZW_MAX_OUTPUT_LENGTH
+            self.max_output_length = max_output_length
             self.bytepos = 0
             self.bitpos = 0
             self.dict = [""] * 4096
@@ -246,6 +256,11 @@ def decode(self):
             cW = self.CLEARDICT
             baos = ""
             while True:
+                if self.max_output_length and len(baos) > self.max_output_length:
+                    raise PdfReadError(
+                        "Output exceeds maximum allowed length (%d bytes) "
+                        "while decoding LZW stream." % self.max_output_length
+                    )
                 pW = cW
                 cW = self.next_code()
                 if cW == -1:
diff --git a/Tests/test_security_lzw_hex.py b/Tests/test_security_lzw_hex.py
new file mode 100644
index 0000000000..78235f955a
--- /dev/null
+++ b/Tests/test_security_lzw_hex.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+"""
+Regression tests for the LZW and ASCIIHex decoder hardening backports:
+
+- CVE-2026-28804       : ASCIIHexDecode quadratic decoding -> bulk decode.
+- CVE-2025-62708/66019 : bound LZWDecode output (decompression bomb).
+"""
+import pytest
+
+from PyPDF2 import filters
+from PyPDF2.errors import PdfReadError, PdfStreamError
+
+
+# --- CVE-2026-28804: ASCIIHexDecode ---------------------------------------
+
+def test_asciihex_basic():
+    assert filters.ASCIIHexDecode.decode("48656c6c6f>") == b"Hello"
+
+
+def test_asciihex_ignores_whitespace():
+    assert filters.ASCIIHexDecode.decode("48 65 6c\n6c\t6f >") == b"Hello"
+
+
+def test_asciihex_odd_length_padded():
+    # ISO 32000 §7.4.2: a trailing odd digit is treated as followed by "0".
+    assert filters.ASCIIHexDecode.decode("4>") == b"@"  # 0x40
+
+
+def test_asciihex_missing_eod_raises():
+    with pytest.raises(PdfStreamError):
+        filters.ASCIIHexDecode.decode("48656c6c6f")  # no '>'
+
+
+# --- CVE-2025-62708 / CVE-2025-66019: LZWDecode output cap -----------------
+
+def _pack_lzw(codes, width=9):
+    """Pack a list of fixed-width LZW codes MSB-first into bytes.
+
+    Kept small so the code width stays at the initial 9 bits (dictlen < 511).
+    """
+    bits = "".join(format(c, "0%db" % width) for c in codes)
+    while len(bits) % 8:
+        bits += "0"
+    return bytes(bytearray(int(bits[i : i + 8], 2) for i in range(0, len(bits), 8)))
+
+
+def test_lzw_decodes_normally():
+    # Three literal 'A' (65) codes then STOP (257) -> text "AAA" (baos is str).
+    data = _pack_lzw([65, 65, 65, 257])
+    assert filters.LZWDecode.Decoder(data).decode() == "AAA"
+
+
+def test_lzw_output_is_capped():
+    # STOP-terminated input, so only the cap (matched by message) can raise.
+    data = _pack_lzw([65] * 200 + [257])
+    with pytest.raises(PdfReadError, match="maximum allowed length"):
+        filters.LZWDecode.Decoder(data, max_output_length=5).decode()