From c77a17cd9268ff9cb63d00905cdfbd5b5dea4319 Mon Sep 17 00:00:00 2001
From: icanhasmath <marcg@activestate.com>
Date: Thu, 18 Jun 2026 11:21:48 -0500
Subject: [PATCH 1/2] Fix CVE-2026-28804: make ASCIIHexDecode decoding linear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASCIIHexDecode.decode accumulated output one character at a time
(retval += chr(...), hex_pair += char) which is O(n^2); a large
/ASCIIHexDecode stream caused excessive CPU time (CWE-407).

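Locate the EOD marker once, strip whitespace, and bulk-decode via
binascii.unhexlify. Also handle a trailing odd hex digit per ISO 32000
§7.4.2 (assumed followed by "0") instead of the previous AssertionError,
and raise PdfStreamError when the payload is not valid hexadecimal.
Note that the result now comes straight from binascii.unhexlify, so on
Python 3 decode() returns bytes where the old loop built a str.
Mirrors upstream pypdf 6.7.5 (PR #3666).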

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 PyPDF2/filters.py              | 38 +++++++++++++++++-----------------
 Tests/test_security_lzw_hex.py | 31 +++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 19 deletions(-)
 create mode 100644 Tests/test_security_lzw_hex.py

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 207a952811..4dccc8d40e 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -50,6 +50,7 @@
 else:
     from io import StringIO
 
+import binascii
 import struct
 import zlib
 
@@ -169,25 +170,24 @@ def decode(data, decodeParms=None):
         :return: a string conversion in base-7 ASCII, where each of its values
             v is such that 0 <= ord(v) <= 127.
         """
-        retval = ""
-        hex_pair = ""
-        index = 0
-        while True:
-            if index >= len(data):
-                raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
-            char = data[index]
-            if char == ">":
-                break
-            elif char.isspace():
-                index += 1
-                continue
-            hex_pair += char
-            if len(hex_pair) == 2:
-                retval += chr(int(hex_pair, base=16))
-                hex_pair = ""
-            index += 1
-        assert hex_pair == ""
-        return retval
+        # CVE-2026-28804: the previous character-by-character accumulation
+        # (retval += ..., hex_pair += ...) is quadratic, so a large
+        # /ASCIIHexDecode stream caused excessive CPU time. Locate the EOD
+        # marker once, strip whitespace, and bulk-decode with binascii.
+        eod = data.find(">")
+        if eod == -1:
+            raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
+        hex_str = b"".join(data[:eod].split()) if isinstance(
+            data, bytes
+        ) else "".join(data[:eod].split())
+        # Per ISO 32000 §7.4.2, a final odd hex digit is assumed to be
+        # followed by a "0".
+        if len(hex_str) % 2 == 1:
+            hex_str += b"0" if isinstance(hex_str, bytes) else "0"
+        try:
+            return binascii.unhexlify(hex_str)
+        except (binascii.Error, TypeError):
+            raise PdfStreamError("Invalid hexadecimal data in ASCIIHexDecode")
 
 
 class LZWDecode(object):
diff --git a/Tests/test_security_lzw_hex.py b/Tests/test_security_lzw_hex.py
new file mode 100644
index 0000000000..a46db3faf2
--- /dev/null
+++ b/Tests/test_security_lzw_hex.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+"""
+Regression tests for the LZW and ASCIIHex decoder hardening backports:
+
+- CVE-2026-28804          : ASCIIHexDecode quadratic decoding -> bulk decode.
+- CVE-2025-62708/66019    : bound LZWDecode output (decompression bomb).
+"""
+import pytest
+
+from PyPDF2 import filters
+from PyPDF2.errors import PdfReadError, PdfStreamError
+
+
+# --- CVE-2026-28804: ASCIIHexDecode ---------------------------------------
+
+def test_asciihex_basic():
+    assert filters.ASCIIHexDecode.decode("48656c6c6f>") == b"Hello"
+
+
+def test_asciihex_ignores_whitespace():
+    assert filters.ASCIIHexDecode.decode("48 65 6c\n6c\t6f >") == b"Hello"
+
+
+def test_asciihex_odd_length_padded():
+    # ISO 32000 §7.4.2: a trailing odd digit is treated as followed by "0".
+    assert filters.ASCIIHexDecode.decode("4>") == b"@"  # 0x40
+
+
+def test_asciihex_missing_eod_raises():
+    with pytest.raises(PdfStreamError):
+        filters.ASCIIHexDecode.decode("48656c6c6f")  # no '>'

From bbb064e208774688cd163c9a29536c76b83d1f21 Mon Sep 17 00:00:00 2001
From: icanhasmath <marcg@activestate.com>
Date: Thu, 18 Jun 2026 11:23:20 -0500
Subject: [PATCH 2/2] Fix CVE-2025-62708, CVE-2025-66019: bound LZWDecode
 output

A crafted LZWDecode stream could amplify into gigabytes of output with no
limit, exhausting memory (CWE-770). 1.28.6's LZWDecode.Decoder.decode
accumulated into `baos` in an unbounded loop.

Add LZW_MAX_OUTPUT_LENGTH (75 MB) and a per-iteration check in
Decoder.decode that raises PdfReadError once the output exceeds it. The
internal Decoder gains a defaulted max_output_length kwarg; the public
LZWDecode.decode signature is unchanged.

Upstream addressed this in pypdf 6.1.3 (PR #3502, output cap) and tightened
the default to 75 MB in 6.4.0 (CVE-2025-66019). 1.28.6 has no LzwCodec
/ LimitReachedError, so this is a hand-written cap on the old decoder
reusing PdfReadError.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 PyPDF2/filters.py              | 13 ++++++++++++-
 Tests/test_security_lzw_hex.py | 26 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
index 4dccc8d40e..8f64ecaf13 100644
--- a/PyPDF2/filters.py
+++ b/PyPDF2/filters.py
@@ -190,16 +190,22 @@ def decode(data, decodeParms=None):
             raise PdfStreamError("Invalid hexadecimal data in ASCIIHexDecode")
 
 
+# CVE-2025-62708 / CVE-2025-66019: bound LZWDecode output so a small stream
+# cannot exhaust memory. Bound once as Decoder's default; pass 0 to disable.
+LZW_MAX_OUTPUT_LENGTH = 75000000  # 75 MB
+
+
 class LZWDecode(object):
     """Taken from:
     http://www.java2s.com/Open-Source/Java-Document/PDF/PDF-Renderer/com/sun/pdfview/decode/LZWDecode.java.htm
     """
 
     class Decoder(object):
-        def __init__(self, data):
+        def __init__(self, data, max_output_length=LZW_MAX_OUTPUT_LENGTH):
             self.STOP = 257
             self.CLEARDICT = 256
             self.data = data
+            self.max_output_length = max_output_length
             self.bytepos = 0
             self.bitpos = 0
             self.dict = [""] * 4096
@@ -246,6 +252,11 @@ def decode(self):
             cW = self.CLEARDICT
             baos = ""
             while True:
+                if self.max_output_length and len(baos) > self.max_output_length:
+                    raise PdfReadError(
+                        "Output exceeds maximum allowed length (%d bytes) "
+                        "while decoding LZW stream." % self.max_output_length
+                    )
                 pW = cW
                 cW = self.next_code()
                 if cW == -1:
diff --git a/Tests/test_security_lzw_hex.py b/Tests/test_security_lzw_hex.py
index a46db3faf2..78235f955a 100644
--- a/Tests/test_security_lzw_hex.py
+++ b/Tests/test_security_lzw_hex.py
@@ -29,3 +29,29 @@ def test_asciihex_odd_length_padded():
 def test_asciihex_missing_eod_raises():
     with pytest.raises(PdfStreamError):
         filters.ASCIIHexDecode.decode("48656c6c6f")  # no '>'
+
+
+# --- CVE-2025-62708 / CVE-2025-66019: LZWDecode output cap -----------------
+
+def _pack_lzw(codes, width=9):
+    """Pack a list of fixed-width LZW codes MSB-first into bytes.
+
+    Kept small so the code width stays at the initial 9 bits (dictlen < 511).
+    """
+    bits = "".join(format(c, "0%db" % width) for c in codes)
+    while len(bits) % 8:
+        bits += "0"
+    return bytes(bytearray(int(bits[i : i + 8], 2) for i in range(0, len(bits), 8)))
+
+
+def test_lzw_decodes_normally():
+    # Three literal 'A' (65) codes then STOP (257) -> "AAA".
+    data = _pack_lzw([65, 65, 65, 257])
+    assert filters.LZWDecode.Decoder(data).decode() == b"AAA"
+
+
+def test_lzw_output_is_capped():
+    # End with STOP (257) and match the message so only the cap satisfies this.
+    data = _pack_lzw([65] * 200 + [257])
+    with pytest.raises(PdfReadError, match="maximum allowed length"):
+        filters.LZWDecode.Decoder(data, max_output_length=5).decode()