Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 31 additions & 20 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
else:
from io import StringIO

import binascii
import struct
import zlib

Expand Down Expand Up @@ -169,25 +170,29 @@ def decode(data, decodeParms=None):
:return: a string conversion in base-7 ASCII, where each of its values
v is such that 0 <= ord(v) <= 127.
"""
retval = ""
hex_pair = ""
index = 0
while True:
if index >= len(data):
raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
char = data[index]
if char == ">":
break
elif char.isspace():
index += 1
continue
hex_pair += char
if len(hex_pair) == 2:
retval += chr(int(hex_pair, base=16))
hex_pair = ""
index += 1
assert hex_pair == ""
return retval
# CVE-2026-28804: the previous character-by-character accumulation
# (retval += ..., hex_pair += ...) is quadratic, so a large
# /ASCIIHexDecode stream caused excessive CPU time. Locate the EOD
# marker once, strip whitespace, and bulk-decode with binascii.
eod = data.find(">")
if eod == -1:
raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
hex_str = b"".join(data[:eod].split()) if isinstance(
data, bytes
) else "".join(data[:eod].split())
# Per ISO 32000 §7.4.2, a final odd hex digit is assumed to be
# followed by a "0".
if len(hex_str) % 2 == 1:
hex_str += b"0" if isinstance(hex_str, bytes) else "0"
try:
return binascii.unhexlify(hex_str)
except (binascii.Error, TypeError):
raise PdfStreamError("Invalid hexadecimal data in ASCIIHexDecode")
Comment thread
icanhasmath marked this conversation as resolved.


# CVE-2025-62708 / CVE-2025-66019: bound LZWDecode output so a small stream
# cannot amplify into gigabytes of memory. This is the default value of the
# ``max_output_length`` argument of ``LZWDecode.Decoder``; decoding raises
# PdfReadError once the accumulated output grows past it. A falsy value (0)
# disables the check entirely -- only do that for trusted input.
LZW_MAX_OUTPUT_LENGTH = 75000000 # 75 MB


class LZWDecode(object):
Expand All @@ -196,10 +201,11 @@ class LZWDecode(object):
"""

class Decoder(object):
def __init__(self, data):
def __init__(self, data, max_output_length=LZW_MAX_OUTPUT_LENGTH):
self.STOP = 257
self.CLEARDICT = 256
self.data = data
self.max_output_length = max_output_length
self.bytepos = 0
self.bitpos = 0
self.dict = [""] * 4096
Expand Down Expand Up @@ -246,6 +252,11 @@ def decode(self):
cW = self.CLEARDICT
baos = ""
while True:
if self.max_output_length and len(baos) > self.max_output_length:
raise PdfReadError(
"Output exceeds maximum allowed length (%d bytes) "
"while decoding LZW stream." % self.max_output_length
)
Comment thread
icanhasmath marked this conversation as resolved.
pW = cW
cW = self.next_code()
if cW == -1:
Expand Down
57 changes: 57 additions & 0 deletions Tests/test_security_lzw_hex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
"""
Regression tests for the LZW and ASCIIHex decoder hardening backports:

- CVE-2026-28804 : ASCIIHexDecode quadratic decoding -> bulk decode.
- CVE-2025-62708/66019 : bound LZWDecode output (decompression bomb).
"""
import pytest

from PyPDF2 import filters
from PyPDF2.errors import PdfReadError, PdfStreamError
Comment thread
icanhasmath marked this conversation as resolved.


# --- CVE-2026-28804: ASCIIHexDecode ---------------------------------------

def test_asciihex_basic():
    """A contiguous run of hex digits ending in '>' decodes to raw bytes."""
    decoded = filters.ASCIIHexDecode.decode("48656c6c6f>")
    assert decoded == b"Hello"
Comment thread
icanhasmath marked this conversation as resolved.


def test_asciihex_ignores_whitespace():
    """Spaces, newlines and tabs between hex digits are skipped."""
    decoded = filters.ASCIIHexDecode.decode("48 65 6c\n6c\t6f >")
    assert decoded == b"Hello"
Comment thread
icanhasmath marked this conversation as resolved.


def test_asciihex_odd_length_padded():
    """A lone trailing hex digit is decoded as if a "0" followed it."""
    # ISO 32000 §7.4.2: "4>" is read as "40", i.e. the single byte 0x40 ("@").
    decoded = filters.ASCIIHexDecode.decode("4>")
    assert decoded == b"@"
Comment thread
icanhasmath marked this conversation as resolved.


def test_asciihex_missing_eod_raises():
    """Input without the '>' end-of-data marker is rejected."""
    unterminated = "48656c6c6f"  # no '>'
    with pytest.raises(PdfStreamError):
        filters.ASCIIHexDecode.decode(unterminated)
Comment thread
icanhasmath marked this conversation as resolved.


# --- CVE-2025-62708 / CVE-2025-66019: LZWDecode output cap -----------------

def _pack_lzw(codes, width=9):
"""Pack a list of fixed-width LZW codes MSB-first into bytes.

Kept small so the code width stays at the initial 9 bits (dictlen < 511).
"""
bits = "".join(format(c, "0%db" % width) for c in codes)
while len(bits) % 8:
bits += "0"
return bytes(bytearray(int(bits[i : i + 8], 2) for i in range(0, len(bits), 8)))


def test_lzw_decodes_normally():
    """A short, well-formed stream decodes without tripping the output cap."""
    # Three literal 'A' (65) codes then STOP (257) -> "AAA".
    packed = _pack_lzw([65, 65, 65, 257])
    decoder = filters.LZWDecode.Decoder(packed)
    assert decoder.decode() == b"AAA"
Comment thread
icanhasmath marked this conversation as resolved.


def test_lzw_output_is_capped():
    """Decoding aborts with PdfReadError once output passes a tiny cap."""
    # Many literal codes with no STOP; a tiny cap must abort before exhaustion.
    packed = _pack_lzw([65] * 200)
    with pytest.raises(PdfReadError):
        decoder = filters.LZWDecode.Decoder(packed, max_output_length=5)
        decoder.decode()