From c77a17cd9268ff9cb63d00905cdfbd5b5dea4319 Mon Sep 17 00:00:00 2001 From: icanhasmath Date: Thu, 18 Jun 2026 11:21:48 -0500 Subject: [PATCH 1/2] Fix CVE-2026-28804: make ASCIIHexDecode decoding linear MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASCIIHexDecode.decode accumulated output one character at a time (retval += chr(...), hex_pair += char) which is O(n^2); a large /ASCIIHexDecode stream caused excessive CPU time (CWE-407). Locate the EOD marker once, strip whitespace, and bulk-decode via binascii.unhexlify. Also handle a trailing odd hex digit per ISO 32000 §7.4.2 (assumed followed by "0") instead of the previous AssertionError. Mirrors upstream pypdf 6.7.5 (PR #3666). Co-Authored-By: Claude Opus 4.8 (1M context) --- PyPDF2/filters.py | 38 +++++++++++++++++----------------- Tests/test_security_lzw_hex.py | 31 +++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 19 deletions(-) create mode 100644 Tests/test_security_lzw_hex.py diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 207a952811..4dccc8d40e 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -50,6 +50,7 @@ else: from io import StringIO +import binascii import struct import zlib @@ -169,25 +170,24 @@ def decode(data, decodeParms=None): :return: a string conversion in base-7 ASCII, where each of its values v is such that 0 <= ord(v) <= 127. """ - retval = "" - hex_pair = "" - index = 0 - while True: - if index >= len(data): - raise PdfStreamError("Unexpected EOD in ASCIIHexDecode") - char = data[index] - if char == ">": - break - elif char.isspace(): - index += 1 - continue - hex_pair += char - if len(hex_pair) == 2: - retval += chr(int(hex_pair, base=16)) - hex_pair = "" - index += 1 - assert hex_pair == "" - return retval + # CVE-2026-28804: the previous character-by-character accumulation + # (retval += ..., hex_pair += ...) is quadratic, so a large + # /ASCIIHexDecode stream caused excessive CPU time. Locate the EOD + # marker once, strip whitespace, and bulk-decode with binascii. + eod = data.find(">") + if eod == -1: + raise PdfStreamError("Unexpected EOD in ASCIIHexDecode") + hex_str = b"".join(data[:eod].split()) if isinstance( + data, bytes + ) else "".join(data[:eod].split()) + # Per ISO 32000 §7.4.2, a final odd hex digit is assumed to be + # followed by a "0". + if len(hex_str) % 2 == 1: + hex_str += b"0" if isinstance(hex_str, bytes) else "0" + try: + return binascii.unhexlify(hex_str) + except (binascii.Error, TypeError): + raise PdfStreamError("Invalid hexadecimal data in ASCIIHexDecode") class LZWDecode(object): diff --git a/Tests/test_security_lzw_hex.py b/Tests/test_security_lzw_hex.py new file mode 100644 index 0000000000..a46db3faf2 --- /dev/null +++ b/Tests/test_security_lzw_hex.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +""" +Regression tests for the LZW and ASCIIHex decoder hardening backports: + +- CVE-2026-28804 : ASCIIHexDecode quadratic decoding -> bulk decode. +- CVE-2025-62708/66019 : bound LZWDecode output (decompression bomb). +""" +import pytest + +from PyPDF2 import filters +from PyPDF2.errors import PdfReadError, PdfStreamError + + +# --- CVE-2026-28804: ASCIIHexDecode --------------------------------------- + +def test_asciihex_basic(): + assert filters.ASCIIHexDecode.decode("48656c6c6f>") == b"Hello" + + +def test_asciihex_ignores_whitespace(): + assert filters.ASCIIHexDecode.decode("48 65 6c\n6c\t6f >") == b"Hello" + + +def test_asciihex_odd_length_padded(): + # ISO 32000 §7.4.2: a trailing odd digit is treated as followed by "0". + assert filters.ASCIIHexDecode.decode("4>") == b"@" # 0x40 + + +def test_asciihex_missing_eod_raises(): + with pytest.raises(PdfStreamError): + filters.ASCIIHexDecode.decode("48656c6c6f") # no '>' From bbb064e208774688cd163c9a29536c76b83d1f21 Mon Sep 17 00:00:00 2001 From: icanhasmath Date: Thu, 18 Jun 2026 11:23:20 -0500 Subject: [PATCH 2/2] Fix CVE-2025-62708, CVE-2025-66019: bound LZWDecode output A crafted LZWDecode stream could amplify into gigabytes of output with no limit, exhausting memory (CWE-770). 1.28.6's LZWDecode.Decoder.decode accumulated into `baos` in an unbounded loop. Add LZW_MAX_OUTPUT_LENGTH (75 MB) and a per-iteration check in Decoder.decode that raises PdfReadError once the output exceeds it. The internal Decoder gains a defaulted max_output_length kwarg; the public LZWDecode.decode signature is unchanged. Upstream addressed this in pypdf 6.1.3 (PR #3502, output cap) and tightened the default to 75 MB in 6.4.0 (PR, CVE-2025-66019). 1.28.6 has no LzwCodec / LimitReachedError, so this is a hand-written cap on the old decoder reusing PdfReadError. Co-Authored-By: Claude Opus 4.8 (1M context) --- PyPDF2/filters.py | 13 ++++++++++++- Tests/test_security_lzw_hex.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py index 4dccc8d40e..8f64ecaf13 100644 --- a/PyPDF2/filters.py +++ b/PyPDF2/filters.py @@ -190,16 +190,22 @@ def decode(data, decodeParms=None): raise PdfStreamError("Invalid hexadecimal data in ASCIIHexDecode") +# CVE-2025-62708 / CVE-2025-66019: bound LZWDecode output so a small stream +# cannot amplify into gigabytes of memory. Set to 0 to disable (trusted input). +LZW_MAX_OUTPUT_LENGTH = 75000000 # 75 MB + + class LZWDecode(object): """Taken from: http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm """ class Decoder(object): - def __init__(self, data): + def __init__(self, data, max_output_length=LZW_MAX_OUTPUT_LENGTH): self.STOP = 257 self.CLEARDICT = 256 self.data = data + self.max_output_length = max_output_length self.bytepos = 0 self.bitpos = 0 self.dict = [""] * 4096 @@ -246,6 +252,11 @@ def decode(self): cW = self.CLEARDICT baos = "" while True: + if self.max_output_length and len(baos) > self.max_output_length: + raise PdfReadError( + "Output exceeds maximum allowed length (%d bytes) " + "while decoding LZW stream." % self.max_output_length + ) pW = cW cW = self.next_code() if cW == -1: diff --git a/Tests/test_security_lzw_hex.py b/Tests/test_security_lzw_hex.py index a46db3faf2..78235f955a 100644 --- a/Tests/test_security_lzw_hex.py +++ b/Tests/test_security_lzw_hex.py @@ -29,3 +29,29 @@ def test_asciihex_odd_length_padded(): def test_asciihex_missing_eod_raises(): with pytest.raises(PdfStreamError): filters.ASCIIHexDecode.decode("48656c6c6f") # no '>' + + +# --- CVE-2025-62708 / CVE-2025-66019: LZWDecode output cap ----------------- + +def _pack_lzw(codes, width=9): + """Pack a list of fixed-width LZW codes MSB-first into bytes. + + Kept small so the code width stays at the initial 9 bits (dictlen < 511). + """ + bits = "".join(format(c, "0%db" % width) for c in codes) + while len(bits) % 8: + bits += "0" + return bytes(bytearray(int(bits[i : i + 8], 2) for i in range(0, len(bits), 8))) + + +def test_lzw_decodes_normally(): + # Three literal 'A' (65) codes then STOP (257) -> "AAA". + data = _pack_lzw([65, 65, 65, 257]) + assert filters.LZWDecode.Decoder(data).decode() == b"AAA" + + +def test_lzw_output_is_capped(): + # Many literal codes with no STOP; a tiny cap must abort before exhaustion. + data = _pack_lzw([65] * 200) + with pytest.raises(PdfReadError): + filters.LZWDecode.Decoder(data, max_output_length=5).decode()